This article originates from the 吾爱破解 (52pojie) forum.
Using a pool.map process pool to efficiently crawl large images. By "large images" I mean full-size wallpapers like the one at the bottom of this post. Remember to leave a rating and a comment!
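If the pool.map pattern is new to you, here is a minimal, self-contained sketch of the idea the script below is built on; fetch_one and the demo URL list are placeholders of my own, not part of the original script:

[Python]
# Minimal sketch of the pool.map download pattern (placeholder names).
import requests
from multiprocessing import Pool

def fetch_one(url):
    # Each worker process downloads one URL and returns the raw bytes.
    return requests.get(url).content

if __name__ == '__main__':
    urls = ['http://desk.zol.com.cn/'] * 3   # placeholder URL list
    with Pool(5) as pool:                    # 5 worker processes
        results = pool.map(fetch_one, urls)  # blocks until every URL is fetched
    print([len(r) for r in results])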
The target site's pages are encoded as gb2312.
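This matters because when the HTTP headers don't declare a charset, requests falls back to ISO-8859-1, which turns gb2312 pages into mojibake. A small example of forcing the codec (the URL is just the site's home page):

[Python]
# Forcing gb2312 decoding on a requests response.
import requests

resp = requests.get('http://desk.zol.com.cn')
resp.encoding = 'gb2312'  # set before touching resp.text
page = resp.text          # now decoded with the correct codec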
Enough talk, here's the code:
[Python]
# coding=gb2312
import os
import random
from time import sleep
from multiprocessing import Pool

import requests
from lxml import etree


class Down_pic():
    def __init__(self):
        self.headers = {
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3",
            "Accept-Encoding": "gzip, deflate",
            "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8",
            "Cache-Control": "max-age=0",
            "Connection": "keep-alive",
            "Host": "desk.zol.com.cn",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36",
        }
        self.main_url = 'http://desk.zol.com.cn'  # site home page
        self.dic = {}              # maps category name -> list of sub-category names
        self.count = 0             # total number of images found
        self.k = 0                 # slicing offset into the downloaded datas list
        self.type = 5              # number of top-level categories to take
        self.small_type_num = 4    # number of sub-categories to take per category
        self.tupian = 3            # images to download per sub-category (at most 3)

    def get_tree(self, html):
        return etree.HTML(html)

    # Get the top-level categories
    def get_type(self):
        main_page = requests.get(self.main_url, headers=self.headers).text
        tree = self.get_tree(main_page)
        a_list = tree.xpath('//*[@id="main"]/dl[1]/dd/a')
        a_list.pop(0)  # drop the "全部" (All) pseudo-category
        for a in a_list[0:self.type]:
            type_name = a.xpath('./text()')[0]
            type_url = self.main_url + a.xpath('./@href')[0]
            yield type_name, type_url

    # Get the sub-categories under each top-level category
    def get_small_type(self):
        for type_name, type_url in self.get_type():
            small_page = requests.get(type_url)
            small_page.encoding = 'gb2312'  # the site serves gb2312 pages
            tree = self.get_tree(small_page.text)
            small_name_list = tree.xpath('//a[@class="pic"]/span/@title')[0:self.small_type_num]
            small_url_list = tree.xpath('//a[@class="pic"]/@href')[0:self.small_type_num]
            self.dic[type_name] = small_name_list
            yield small_url_list

    # Get the picture list of each sub-category
    def get_pic_list(self):
        for pic_page_url in self.get_small_type():
            for pic_url in pic_page_url:
                url = self.main_url + pic_url
                pic_page = requests.get(url=url).text
                tree = self.get_tree(pic_page)
                pic_list_url = tree.xpath('//*[@id="showImg"]/li/a/@href')
                for pic_url in pic_list_url[:self.tupian]:  # a few pictures per sub-category
                    yield self.main_url + pic_url

    # Get the page for the desired picture size
    def get_size(self):
        for pic_url in self.get_pic_list():
            pic_page = requests.get(pic_url).text
            tree = self.get_tree(pic_page)
            try:
                # most pictures default to 2880*1800
                data_url = self.main_url + tree.xpath('//*[@id="tagfbl"]/a[2]/@href')[0]
            except Exception:
                data_url = tree.xpath('//*[@id="bigImg"]/@src')[0]
            yield data_url

    # Get the final download URL of each picture
    def get_data(self):
        for url in self.get_size():
            data_page = requests.get(url).text
            tree = self.get_tree(data_page)
            try:
                pic_data_url = tree.xpath('/html/body/img[1]/@src')[0]
            except Exception:
                pic_data_url = url
            self.count += 1
            yield pic_data_url
        self.num = self.count  # kept for the commented-out progress print below

    # Start the process pool and download
    def ppp(self):
        print('Starting process pool')
        pool = Pool(5)
        datas = pool.map(self.download, [url for url in self.get_data()])
        pool.close()
        pool.join()
        for type_name in self.dic:
            for small_name in self.dic[type_name]:
                path = type_name + '/' + small_name
                if '?' in path:  # strip anything from '?' on (invalid in Windows paths)
                    path = path[:path.find('?')]
                if not os.path.exists(path):
                    os.makedirs(path)
                for data in datas[self.k:self.k + self.tupian]:
                    name = small_name + str(random.randint(1, 1000))  # picture file name
                    pa = path + '/' + name + '.jpg'
                    with open(pa, 'wb') as f:
                        f.write(data)
                self.k += self.tupian
        print('Downloaded {} pictures in total'.format(self.count))

    def download(self, url):
        # print('\rProgress: {}%'.format((1 - self.num / self.count) * 100), end='')
        data = requests.get(url=url).content
        sleep(1)  # be polite to the server
        return data


if __name__ == '__main__':
    down = Down_pic()
    down.ppp()
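One detail the script quietly relies on: pool.map returns its results in the same order as the input iterable, even though the five worker processes run concurrently. That ordering guarantee is what lets the datas[self.k:self.k + self.tupian] slices line up with the category/sub-category loop. A tiny demonstration:

[Python]
# pool.map preserves input order regardless of which worker finishes first.
from multiprocessing import Pool

def square(x):
    return x * x

if __name__ == '__main__':
    with Pool(4) as pool:
        print(pool.map(square, [1, 2, 3, 4, 5]))  # always [1, 4, 9, 16, 25]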
Note: everything in this article is for learning purposes only; commercial use is not allowed. If there is any infringement, please contact me for removal. Thanks.
[Image attachment: Snipaste_2019-08-02_16-04-10.jpg]
A bit of everything.
[Image attachment: Snipaste_2019-08-02_17-54-56.jpg]