本文来源吾爱破解论坛
本次爬取目标是京东的商品。系统环境:Deepin15.7(Linux)
开发环境:Python3.6.5
需要插件:PyMysql、selenium、lxml
使用chrome浏览器模拟用户翻页操作
所以需要下载chrome浏览器对应版本的驱动
我的chrome是68的 下载的chromeDriver 2.41版
chromeDriver点击进入对驱动下载
Win环境需要配置环境变量
配置chrome浏览器
[Python] 纯文本查看 复制代码
# Configure a Chrome instance driven by chromedriver.
# Images are disabled to save bandwidth and speed up crawling.
chrome_options = webdriver.ChromeOptions()
no_images = {"profile.managed_default_content_settings.images": 2}
chrome_options.add_experimental_option("prefs", no_images)

browser = webdriver.Chrome(chrome_options=chrome_options)
browser.set_window_size(800, 1000)

# Shared explicit wait (10 s timeout) for all page interactions.
wait = WebDriverWait(browser, 10)

# Search keywords to crawl.
KEYWORDS = ['手机', '笔记本电脑', '硬盘', '内存', '零食', '男装',
            '跑鞋', 'Air Jordan', '眼镜', '秋裤', '鼠标', '帽子']
连接数据库
[Python] 纯文本查看 复制代码
# Open the MySQL connection shared by the whole spider.
# DictCursor makes every fetched row a dict keyed by column name.
con = pymysql.connect(
    host='localhost',
    port=3306,
    user='******',      # replace with your MySQL user name
    password='******',  # replace with your MySQL password
    db='*****',         # replace with your database name
    cursorclass=pymysql.cursors.DictCursor,
)
cursor = con.cursor()  # shared cursor for all statements
正文
[Python] 纯文本查看 复制代码
def commit_db(goods):
    """Batch-insert a list of goods dicts into the jd_phone table.

    Uses a parameterized executemany() so values containing quotes
    (single or double) are escaped by the driver — this fixes the
    "double quote breaks the SQL statement" bug the original had with
    string-concatenated SQL, and removes the injection risk.
    """
    if not goods:  # nothing scraped on this page
        return
    table = 'jd_phone'
    # All dicts share the same key set, so the first one defines the columns.
    columns = ', '.join(goods[0].keys())
    placeholders = ', '.join(['%s'] * len(goods[0]))
    sql = 'insert into %s(%s) values (%s)' % (table, columns, placeholders)
    rows = [tuple(good.values()) for good in goods]
    try:
        effect_rows = cursor.executemany(sql, rows)
    except pymysql.err.IntegrityError:
        # Duplicate primary key (item already stored) — skip this batch.
        effect_rows = 0
    if effect_rows:
        con.commit()


def analyze_page(page_source):
    """Parse one JD search-result page and persist the goods found on it."""
    etree_html = etree.HTML(page_source)
    goods_list = []
    # Every product <li> carries its SKU id in the data-sku attribute.
    sku_list = etree_html.xpath('//li[@class="gl-item"]/@data-sku')
    for sku in sku_list:
        good = {}
        title = []
        em_obj_list = etree_html.xpath(
            '//li[@data-sku="' + sku + '"]//div[@class="p-name p-name-type-2"]/a/em')
        img = etree_html.xpath(
            '//li[@data-sku="' + sku + '"]//div[@class="p-img"]/a/img/@src')
        if not img:
            img = ['']
        price = etree_html.xpath(
            '//li[@data-sku="' + sku + '"]//div[@class="p-price"]/strong/i/text()')
        if not price:  # guard like img/commit/shop — avoids IndexError below
            price = ['']
        commit = etree_html.xpath(
            '//li[@data-sku="' + sku + '"]//div[@class="p-commit"]/strong/a/text()')
        if not commit:
            commit = ['']
        shop = etree_html.xpath(
            '//li[@data-sku="' + sku + '"]//div[@class="p-shop"]/span/a/text()')
        if not shop:
            shop = ['']
        # Title text is split across <em> children; join all text nodes.
        for em_obj in em_obj_list:
            title.append(''.join(em_obj.xpath('.//text()')))
        good['itemId'] = sku
        good['title'] = title[0]
        good['img'] = img[0]
        good['price'] = price[0]
        good['commit'] = commit[0]
        good['shop'] = shop[0]
        goods_list.append(good)
    commit_db(goods_list)


def sleep(n):
    """Sleep a short randomized interval (n/10 .. (n+2)/10 seconds)."""
    time.sleep(randint(n, n + 2) / 10)


def get_page(page, kw):
    """Load search results for keyword *kw*, scroll to trigger lazy loading,
    then jump to the next page via the bottom page-number input.

    Returns the page source after the jump.
    """
    n = randint(10, 15)
    if page == 1:
        # First page is reached by URL; later pages by the pager input below.
        url = 'https://search.jd.com/Search?keyword=%s&enc=utf-8' % quote(kw)
        browser.get(url)
    for i in range(n):
        # Scroll down in n steps so lazily-loaded items are rendered.
        browser.execute_script(
            'window.scrollTo(document.body.scrollHeight/%d*%d, '
            'document.body.scrollHeight/%d*%d)' % (n, i, n, i + 1))
        sleep(4)
    # `page_input` (not `input`) — avoid shadowing the builtin.
    page_input = wait.until(
        EC.presence_of_element_located(
            (By.CSS_SELECTOR, "#J_bottomPage input.input-txt")))
    page_input.clear()
    page_input.send_keys(page + 1)
    submit = wait.until(
        EC.element_to_be_clickable(
            (By.CSS_SELECTOR, '#J_bottomPage a.btn.btn-default')))
    submit.click()
    page_source = browser.page_source
    return page_source


def main():
    """Crawl 100 result pages for every keyword, then close DB resources."""
    for keyword in KEYWORDS:
        for page in range(1, 101):
            page_source = get_page(page, keyword)
            analyze_page(page_source)
            print(page)
    print('spider data complete')
    cursor.close()
    con.close()


if __name__ == '__main__':
    main()
有点小 bug:遇到爬取内容中有双引号的时候就会报错,原因是 SQL 语句被阻断了。
不过后面不会优化了 本来就是练手用的
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。