This article originally comes from the 吾爱破解 (52pojie) forum.
This is my first post, so I'm not sure how readable the formatting will be. The only third-party modules you need to install are requests and lxml.
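If they are not installed yet, both can typically be pulled in with pip (assuming a standard Python 3 environment):

pip install requests lxml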
import os, time
import threading
import requests
from queue import Queue
from lxml import etree
def runforever(func):  # decorator that keeps a worker method looping forever
    def forever(obje):
        while True:
            time.sleep(1)
            func(obje)
    return forever
class MZiTuSpider():
    def __init__(self, path):
        self.PATH = path
        self.main_url_queue = Queue()     # queue of listing pages to crawl
        self.chapter_url_queue = Queue()  # queue of gallery URLs found on each listing page
        self.pic_url_queue = Queue()      # queue of per-image page URLs
        self.download_queue = Queue()     # queue of resolved image info to download
    def get_html(self, url):
        # This site returns 404 for image downloads without a Referer header;
        # add a proxy yourself if you need one.
        headers = {
            "Referer": "https://www.mzitu.com/189066",
            "Sec-Fetch-Mode": "no-cors",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"
        }
        res = requests.get(url, headers=headers)
        return res
    def add_main_url(self):
        _url = "https://www.mzitu.com/hot/page/{}/"
        for i in range(1, 3):  # how many listing pages to crawl; I only set it to 2
            self.main_url_queue.put(_url.format(i))
    @runforever
    def add_chapter_url(self):
        url = self.main_url_queue.get()
        res = self.get_html(url)
        if res.status_code != 200:
            self.main_url_queue.put(url)  # re-queue the page and retry later
        else:
            html = etree.HTML(res.text)
            chapter_urls = html.xpath("//ul[@id='pins']/li/a/@href")
            for chapter_url in chapter_urls:
                self.chapter_url_queue.put(chapter_url)
        self.main_url_queue.task_done()
    @runforever
    def add_pic_url(self):
        url = self.chapter_url_queue.get()
        res = self.get_html(url)
        if res.status_code != 200:
            self.chapter_url_queue.put(url)  # re-queue the gallery and retry later
        else:
            res.encoding = res.apparent_encoding
            html = etree.HTML(res.text)
            # the next-to-last pagination link holds the total page count
            pages = html.xpath("//div[@class='pagenavi']/a[last()-1]/span/text()")[0]
            for num in range(1, int(pages) + 1):  # +1 so the last page is not skipped
                self.pic_url_queue.put(url + '/' + str(num))
        self.chapter_url_queue.task_done()
    @runforever
    def parse(self):
        url = self.pic_url_queue.get()
        res = self.get_html(url)
        html = etree.HTML(res.text)
        download_url = html.xpath("//div[@class='main-image']/p/a/img/@src")
        title = html.xpath("//h2[@class='main-title']/text()")
        info = {'title': title, 'url': download_url}
        self.download_queue.put(info)
        self.pic_url_queue.task_done()
    @runforever
    def download(self):
        if not os.path.exists(self.PATH):
            os.makedirs(self.PATH)  # create the save directory before writing to it
        info = self.download_queue.get()
        print("Fetching", info['url'], info['title'])
        pic_content = self.get_html(info['url'][0])
        with open(self.PATH + '/' + info['title'][0] + '.jpg', 'wb') as f:
            f.write(pic_content.content)
        print('Downloaded ------> %s' % info['title'])
        self.download_queue.task_done()
    def run_thread(self, func, count=1):
        for _ in range(count):
            t = threading.Thread(target=func)
            t.daemon = True  # daemon threads exit together with the main thread
            t.start()
    def run(self):
        main_url_t = threading.Thread(target=self.add_main_url)
        main_url_t.start()
        # count is the number of threads per stage; too many and the site may buckle
        self.run_thread(self.add_chapter_url, count=1)
        self.run_thread(self.add_pic_url, count=1)
        self.run_thread(self.parse, count=1)
        self.run_thread(self.download, count=1)
        self.main_url_queue.join()
        self.chapter_url_queue.join()
        self.pic_url_queue.join()
        self.download_queue.join()
if __name__ == '__main__':
    PATH = r'D:\MZiTu'  # save path
    m = MZiTuSpider(PATH)
    start = time.time()
    m.run()
    end = time.time()
    print("Total time: %s" % (end - start))