小白的多线程爬虫。以某漫画为例

piaodoo 编程教程 2020-02-22 22:02:51 920 0 python教程

本文来源：吾爱破解论坛

#coding:utf-8
import functools
import http.cookiejar as cjar
import json
import os
import re
import sys
import threading
import urllib.parse as parse
import urllib.request as ub
from queue import Empty
from queue import Queue

from lxml import etree

# Decorator that adds retry handling to a data-fetching function.
def get_body(func):
    """Wrap *func* so that exceptions are retried instead of propagated.

    The wrapped function is called up to three times. The wrapper returns:

    * whatever *func* returns on the first attempt that does not raise;
    * ``1`` as soon as an exception whose text contains ``"404"`` is raised
      (no further attempts are made);
    * ``False`` when every attempt raised.

    Each failure is printed together with the URL being fetched.
    """
    max_attempts = 3

    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # The URL is the second positional argument when func is a method
        # (self, url). Fall back gracefully for other call shapes so that
        # the error report itself can never raise IndexError.
        if len(args) > 1:
            url = args[1]
        elif args:
            url = args[0]
        else:
            url = kwargs.get("url", "")

        for _ in range(max_attempts):
            try:
                return func(*args, **kwargs)
            except Exception as e:
                message = str(e)
                print("error:{},url:{}".format(message, url))
                if "404" in message:
                    # Sentinel understood by the callers: resource missing.
                    return 1
        return False

    return wrapper
# Worker thread that resolves the image URLs of each chapter.
class chaptersThread(threading.Thread):
    """Consume (title, href) tuples and publish ``{title: [image urls]}``.

    Items are taken from ``chaptersQ`` without blocking; the thread stops
    when that queue is empty or when the module-level ``CHAPTERS_EXIT``
    flag is set. Results are put on ``ImagesUrlQ``.
    """

    # Patterns for the values embedded in the chapter page, compiled once.
    _IMAGES_RE = re.compile(r"chapterImages = (\[.*?\])")
    _PATH_RE = re.compile(r'chapterPath = "(.*?)"')
    _IMAGE_HOST = "http://res.gufengmh8.com/"

    def __init__(self, chaptersQ, ImagesUrlQ, threadName, spider):
        super(chaptersThread, self).__init__()
        self.chaptersQ = chaptersQ
        self.threadName = threadName
        self.ImagesUrlQ = ImagesUrlQ
        self.spider = spider

    def run(self):
        """Process chapters until the input queue is drained."""
        print("{}:线程正在工作".format(self.threadName))
        global CHAPTERS_EXIT
        while not CHAPTERS_EXIT:
            try:
                chapterTuple = self.chaptersQ.get(False)
            except Empty:
                break
            title = chapterTuple[0]
            # Drop the first character of the href before appending it to
            # the site root stored on the spider.
            url = self.spider.index + chapterTuple[1][1:]
            raw = self.spider.get_data(url)
            # get_data yields bytes on success; False or the sentinel 1
            # signal a failed download and must not be decoded.
            if not isinstance(raw, bytes) or not raw:
                print("获取失败{}".format(url))
                continue
            try:
                html = re.sub('\xa9|\xbb', '', raw.decode(encoding="utf-8"))
            except UnicodeDecodeError as e:
                print("error:{},url:{}".format(str(e), url))
                continue
            imagesUrl = self.parseUrl(html)
            if not imagesUrl:
                print("获取失败{}".format(url))
                continue
            self.ImagesUrlQ.put({title: imagesUrl})
        print("{}:{}完成工作".format(self.threadName, self.name))

    def parseUrl(self, html):
        """Return the absolute image URLs found in a chapter page.

        Args:
            html: decoded page text containing ``chapterImages = [...]``
                and ``chapterPath = "..."`` assignments.

        Returns:
            A list of URLs, or an empty list when either assignment is
            missing or the image list is not valid JSON.
        """
        images_match = self._IMAGES_RE.search(html)
        path_match = self._PATH_RE.search(html)
        if images_match is None or path_match is None:
            return []
        try:
            images = json.loads(images_match.group(1))
        except ValueError:
            return []
        prefix = self._IMAGE_HOST + path_match.group(1)
        return [prefix + image for image in images]
# Worker thread that downloads every image of a chapter.
class ImagesUrlThread(threading.Thread):
    """Consume ``{title: [image urls]}`` dicts and save the images to disk.

    Items are taken from ``ImagesUrlQ`` without blocking; the thread stops
    when that queue is empty or when the module-level ``IMAGESURL_EXIT``
    flag is set. Images are written to ``<spider.dir_path><title>/<n>.jpg``
    where ``n`` is the zero-based position of the URL in the list.
    Downloads that fail are recorded on the module-level ``ErrorQ``.
    """

    def __init__(self, ImagesUrlQ, threadName, spider):
        super(ImagesUrlThread, self).__init__()
        self.ImagesUrlQ = ImagesUrlQ
        self.threadName = threadName
        self.spider = spider

    def run(self):
        """Download chapters until the input queue is drained."""
        print("{}:线程正在工作".format(self.threadName))
        global IMAGESURL_EXIT
        while not IMAGESURL_EXIT:
            try:
                images_chapter = self.ImagesUrlQ.get(False)
            except Empty:
                break
            for title, images in images_chapter.items():
                self._download_chapter(title, images)
        print("获取完成{}".format(self.threadName))

    def _download_chapter(self, title, images):
        """Fetch each URL in *images* and store it in the chapter folder."""
        chapter_dir = self.spider.dir_path + title
        try:
            os.makedirs(chapter_dir, exist_ok=True)
        except OSError as e:
            print("error:{}".format(str(e)))

        for i, url in enumerate(images):
            imagesIo = self.spider.get_data(url)
            if imagesIo == 1:
                # Sentinel for a 404: the page does not exist, so there is
                # nothing to save and nothing worth retrying.
                continue
            if imagesIo:
                save_path = os.path.join(chapter_dir, str(i) + ".jpg")
                try:
                    with open(save_path, "wb") as file:
                        file.write(imagesIo)
                    continue
                except OSError as e:
                    print("error:{}".format(str(e)))
            # Download or write failed: queue the page for a retry pass.
            ErrorQ.put({"title": title, "page": str(i), "url": url})
            print("章节:{},第{}页,url:{},获取失败".format(title, str(i), url))
# Worker thread that retries the image downloads that failed earlier.
class ErrorUrlThrad(threading.Thread):
    """Consume ``{"title", "page", "url"}`` dicts and retry each download.

    Items are taken from ``ErrorQ`` without blocking; the thread stops when
    that queue is empty or when the module-level ``ERRORU_EXIT`` flag is
    set. A successful retry is written to
    ``<spider.dir_path><title>/<page>.jpg``.
    """

    def __init__(self, ErrorQ, tName, spider):
        super(ErrorUrlThrad, self).__init__()
        self.ErrorQ = ErrorQ
        self.spider = spider
        self.threadName = tName

    def run(self):
        """Retry failed pages until the error queue is drained."""
        print("{}:线程正在工作".format(self.threadName))
        global ERRORU_EXIT
        while not ERRORU_EXIT:
            try:
                error_dict = self.ErrorQ.get(False)
            except Empty:
                break

            title = error_dict["title"]
            page = error_dict["page"]
            url = error_dict["url"]

            imageIo = self.spider.get_data(url)
            # Only real payloads may be written; False and the 404
            # sentinel 1 both mean the retry failed.
            if isinstance(imageIo, bytes) and imageIo:
                chapter_dir = self.spider.dir_path + title
                try:
                    os.makedirs(chapter_dir, exist_ok=True)
                    with open(os.path.join(chapter_dir, page + ".jpg"), "wb") as f:
                        f.write(imageIo)
                    continue
                except OSError as e:
                    print("error:{}".format(str(e)))
            print("章节:{},第{}页,url:{},重新获取失败".format(title, page, url))

class MSpider(object):
    """Search a manga site, list chapters and download them with threads.

    Attributes:
        index: root URL of the site (used as prefix for search and
            chapter URLs).
        CartoonName: search keyword entered by the user.
        dir_path: directory prefix under which chapter folders are created.
    """

    def __init__(self, index='', CartoonName=None, dir_path=''):
        self.cookie = cjar.CookieJar()
        self.opener = ub.build_opener(ub.HTTPCookieProcessor(self.cookie))
        self.opener.addheaders = [("User-Agent", "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.119 Mobile Safari/537.36")]
        self.index = index  # site home page
        self.CartoonName = CartoonName
        self.dir_path = dir_path

    def get_cookie(self):
        """Request the home page once so the cookie jar gets populated."""
        try:
            # Close the response explicitly; only the cookies are needed.
            with self.opener.open(self.index, timeout=10):
                pass
        except Exception as e:
            print(str(e))

    def _fetch_html(self, url):
        """Return the decoded page at *url*, or None when the fetch failed."""
        raw = self.get_data(url)
        # get_data returns False or the sentinel 1 on failure.
        if not isinstance(raw, bytes) or not raw:
            return None
        return re.sub('\xa9|\xbb', '', raw.decode(encoding="utf-8"))

    def search_api(self):
        """Search for ``CartoonName`` and let the user pick one result.

        Returns:
            The link of the chosen manga, or ``""`` when the search page
            could not be fetched, produced no results, or the input was
            not a number. A choice outside the listed range is clamped to
            the nearest valid entry.
        """
        if not isinstance(self.CartoonName, str):
            self.CartoonName = str(self.CartoonName)

        data = parse.urlencode({'keywords': self.CartoonName})
        url = self.index + "search/?" + data

        html = self._fetch_html(url)
        if html is None:
            print("error:{}".format("search page could not be fetched"))
            return ""
        html_xpath = etree.HTML(html)
        try:
            cartoonList = html_xpath.xpath('//*[@id="update_list"]/div/div/div[2]/a/@href')  # manga home pages
            update = html_xpath.xpath('//*[@id="update_list"]/div/div/div[2]/p[3]/span[2]/text()')
            if not cartoonList:
                return ""
            # Number the entries so the user knows which digit to type.
            for number, (link, date) in enumerate(zip(cartoonList, update), 1):
                print("{}. 更新日期:{},漫画链接:{}".format(number, date, link))
            choice = int(input('请根据时间选择你要看的漫画？请输入阿拉伯数字进行选择。'))
        except Exception as e:
            print("error:{}".format(str(e)))
            return ""
        choice = min(max(choice, 1), len(cartoonList))
        return cartoonList[choice - 1]

    # Fetch the chapter list.
    def get_chapter(self, index):
        """Ask the user for a chapter range and return it as a queue.

        Args:
            index: URL of the manga's chapter-list page.

        Returns:
            A ``Queue`` of ``(title, href)`` tuples for the selected
            chapters. The queue is empty when the page could not be
            fetched, lists no chapters, or titles and links do not pair up.
        """
        chapters = Queue()
        html = self._fetch_html(index)
        if html is None:
            print("获取失败{}".format(index))
            return chapters
        html_xpath = etree.HTML(html)
        chapters_href = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/@href')
        chapters_title = html_xpath.xpath('//*[@id="chapter-list-1"]/li/a/span/text()')
        chapters_len = len(chapters_title)

        # Without this guard the range prompt below could never succeed
        # (zero chapters) or titles would be paired with wrong links.
        if chapters_len == 0 or chapters_len != len(chapters_href):
            print("error:{}".format("chapter titles and links do not match"))
            return chapters

        print("最近更新10章更新:\n      {}\n      {}".format(
            chapters_title[-10:-5], chapters_title[-5:]))

        print('因为其中包含特殊章节,并不是每个章节链接和每一话动漫都对应.\n'
              '请自行斟酌要爬去的章节范围.\n\n\n'
              '您搜素漫画一共{}章节,'
              .format(chapters_len))
        while True:
            try:
                start_page = int(input("请输入起始章节："))
                end_page = int(input("请输入结束章节："))
            except ValueError as e:
                print('您输入的章节数目格式存在错误请重新出入,Error:{}'.format(str(e)))
                continue
            if end_page > chapters_len:
                print("章节超出搜素范围,请重新输入")
            elif start_page > end_page:
                print('起始章节大于结束章节,请重新输入')
            elif start_page < 1:
                print("起始章节存在错误")
            else:
                break

        # Chapters are 1-based for the user, 0-based in the lists.
        for i in range(start_page - 1, end_page):
            chapters.put((chapters_title[i], chapters_href[i]))
        return chapters

    @staticmethod
    def _run_workers(workers):
        """Start every thread in *workers*, then wait for all of them."""
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()

    # Entry point that downloads the selected chapters.
    def get_oneChapter(self, chaptersQ):
        """Resolve image URLs for all chapters, then download the images.

        Runs ten ``chaptersThread`` workers over *chaptersQ*, followed by
        twenty ``ImagesUrlThread`` workers over the resulting queue. Exits
        the process with status 1 when no chapter produced any image URL.
        """
        global CHAPTERS_EXIT
        global IMAGESURL_EXIT

        ImagesUrlQ = Queue()
        # The workers stop on their own once their queue is empty, so a
        # plain join replaces polling Queue.empty() in a tight loop.
        self._run_workers([
            chaptersThread(chaptersQ, ImagesUrlQ, "cps{}".format(n), self)
            for n in range(1, 11)
        ])
        CHAPTERS_EXIT = True

        print("章节获取完成,一共获取了{}章漫画".format(ImagesUrlQ.qsize()))
        if ImagesUrlQ.empty():
            print("ImagesUrlQ is empty ,漫画被下架!")
            sys.exit(1)

        self._run_workers([
            ImagesUrlThread(ImagesUrlQ, "IMUs{}".format(n), self)
            for n in range(1, 21)
        ])
        IMAGESURL_EXIT = True
        print("全部获取完成")

    @get_body
    def get_data(self, *args, **kwargs):
        """Return the raw bytes at URL ``args[0]`` (retried by get_body)."""
        with self.opener.open(args[0], timeout=30) as response:
            return response.read()

# Stop flags read by the worker threads' run loops; all start cleared.
CHAPTERS_EXIT, IMAGESURL_EXIT, ERRORU_EXIT = False, False, False
# NOTE(review): error_num is not read anywhere in the visible code.
error_num = 0
# Queue of image downloads that failed and are retried at the end.
ErrorQ = Queue()
def main():
    """Prompt for a manga name, download it, then retry failed pages.

    The download directory is ``manhua/<name>/`` on Windows and
    ``/storage/emulated/0/manhua/<name>/`` elsewhere. Exits with status 1
    when the search returns nothing.
    """
    CartoonName = input("请输入你想搜素的漫画名:")
    if sys.platform.startswith('win'):
        dir_path = "manhua/" + CartoonName + "/"
    else:
        dir_path = "/storage/emulated/0/manhua/" + CartoonName + "/"
    try:
        # makedirs also creates the parent "manhua" folder on first run.
        os.makedirs(dir_path, exist_ok=True)
    except OSError as e:
        print("error:{}".format(str(e)))

    index = "http://m.gufengmh8.com/"
    spider = MSpider(index, CartoonName, dir_path)
    spider.get_cookie()
    index = spider.search_api()
    if not index:
        print("------------漫画不存在-----------")
        sys.exit(1)

    chapters = spider.get_chapter(index)
    spider.get_oneChapter(chapters)
    if ErrorQ.empty():
        return

    eThreads = [
        ErrorUrlThrad(ErrorQ, tname, spider)
        for tname in ("error1", "error2", "error3")
    ]
    for eThread in eThreads:
        eThread.start()
    # Each worker returns once the error queue is empty, so joining is
    # enough; no need to poll the queue in a busy loop.
    for eThread in eThreads:
        eThread.join()
# Run the downloader only when executed as a script.
if __name__ == '__main__':
    main()

运行环境：Android 和 Windows，使用 Python 3.5，不兼容 Python 2.7。

版权声明：

本站所有资源均为站长或网友整理自互联网或站长购买自互联网，站长无法分辨资源版权出自何处，所以不承担任何版权以及其他问题带来的法律责任，如有侵权或者其他问题请联系站长删除！站长QQ754403226 谢谢。
有关影视版权：本站只供百度云网盘资源，版权均属于影片公司所有，请在下载后24小时删除，切勿用于商业用途。本站所有资源信息均从互联网搜索而来，本站不对显示的内容承担责任，如您认为本站页面信息侵犯了您的权益，请附上版权证明邮件告知【754403226@qq.com】，在收到邮件后72小时内删除。本文链接：https://www.piaodoo.com/7419.html

上一篇：自学Python全栈开发的第二次笔记（Python需要注意的地方）

下一篇：【原创】python写的自动投注脚本（SSC）

相关文章

雅思班学费|live的现在分词

幽默的英文形容词|英语四级证书什么样子

大树的英语|英语24个字母表

is的将来时|stop的ing形式

像计算机科学家那样思考 Python中文版第二版

Python精要参考(第二版)_翻译.pdf

《Python标准库》中文版.pdf

Python PEP8 编码规范中文版.pdf

搜索

文章专栏

Python 旋转打印各种矩形的方法

对Python中小整数对象池和大整数对象池的使用详解

python 梯度法求解函数极值的实例

通过PHP与Python代码对比的语法差异详解

python实现函数极小值

Python中面向对象你应该知道的以下知识

python 寻找离散序列极值点的方法

python 绘制拟合曲线并加指定点标识的实现

使用python画社交网络图实例代码

详解Python中的各种转义符\n\r\t

我们为什么要减少Python中循环的使用

Python Matplotlib 基于networkx画关系网络图

Python求离散序列导数的示例

python卸载后再次安装遇到的问题解决

python networkx 包绘制复杂网络关系图的实现

最近发表

孩子喜欢看的奥秘世界百科全书 pdf 阿里有违规联系删除

【鬼吹灯】【盗墓笔记】有声书分享

【网红教辅书】作文金句800例

极客时间-视频课-沈欣-互联网人的数字化企业生存指南（完结）

尚硅谷 MQTT 物联网技术实战

极客时间-专栏课-秦晓辉-运维监控系统实战笔记（完结）

Wireshark网络分析的艺术（林沛满著）

每个家庭都需要的体育教育

轮滑基础教学

GPT图解：大模型是怎样构建的 (黄佳)

标签列表

百度云资源
建筑设计
数控模具
机械工程
菜鸟知道
石油化工
python教程
手游排行
射击游戏
php教程
discuz插件
茂名市高级技工学校
PC game
福利教程
自媒体运营
无毒软件网
高三数学
心得体会
高一语文
Windows软件
织梦教程
工作计划
php源码
织梦模板
discuz模板
java编程
wordpress
android软件
管理
c++教程