本文来源吾爱破解论坛
本帖最后由 xiaohanxxx 于 2019-9-20 09:48 编辑
[Python] 纯文本查看 复制代码
import os
import re
import threading
import time

import requests
from lxml import etree

# Crawler for quanshuwang.com: walks every category listing, then downloads
# each novel chapter by chapter into <BOOK_ROOT>/<book>/<chapter>/text.txt.

BASE_LIST_URL = 'http://www.quanshuwang.com/list/'
CATEGORY_IDS = range(1, 12)        # category pages are numbered 1..11
BOOK_ROOT = 'F:/python/xiaoshuo'   # default folder that receives the novels
REQUEST_TIMEOUT = 30               # seconds; keeps a stalled socket from hanging a worker
MAX_ATTEMPTS = 5                   # how many times one book is tried before giving up

# Characters stripped from names before they are used as folder names.
# The two patterns differ on purpose: only book titles lose their dots.
BOOK_NAME_ILLEGAL = re.compile(r"[\\/:*?<>|!\.\"]")
CHAPTER_NAME_ILLEGAL = re.compile(r"[\\/:*?<>|!\"]")


def _fetch_html(url):
    """Download *url*, decode the body as GBK and return the parsed tree.

    Raises whatever ``requests.get`` raises (including timeouts). The
    returned value is whatever ``etree.HTML`` produces for the body;
    callers below treat a result without ``.xpath`` as a blank page.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.encoding = 'gbk'
    return etree.HTML(response.text)


def get_url():
    """Yield ``(first_page_url, last_page_number)`` for every category.

    The last page number is read from the text of the ``a.last`` pager
    link; ``int()`` raises ``ValueError`` if that link is missing.
    """
    for category in CATEGORY_IDS:
        url = BASE_LIST_URL + str(category) + '_1.html'
        html = _fetch_html(url)
        lastpage = int(''.join(html.xpath('//a[@class="last"]/text()')))
        yield url, lastpage


def get_url_page():
    """Yield, for each listing page of each category, its list of novel URLs."""
    for first_page_url, lastpage in get_url():
        # Everything up to and including "<category>_"; the page number and
        # ".html" are appended below.
        prefix = ''.join(re.findall(r'(.*\d_)', first_page_url))
        # +1 so the final listing page is crawled too.
        for page in range(1, lastpage + 1):
            html = _fetch_html(prefix + str(page) + '.html')
            yield html.xpath('//a[@class="clearfix stitle"]/@href')


def _crawl_book(url, attempt, root_dir):
    """Download every chapter of the novel whose detail page is *url*.

    Any failure (blank page, network error, bad link) propagates to the
    caller, which decides whether to retry.
    """
    print(url, "执行第", attempt, "次")
    html = _fetch_html(url)

    # A blank response yields an empty href here, which makes the next
    # request fail and triggers the caller's retry.
    href = ''.join(html.xpath('//div[@class="detail"]/a/@href'))
    title = BOOK_NAME_ILLEGAL.sub('', ''.join(html.xpath('//h1/text()')))
    book_dir = os.path.join(root_dir, title)

    toc = _fetch_html(href)
    # exist_ok avoids the check-then-create race between worker threads and
    # also creates root_dir itself when it is missing.
    os.makedirs(book_dir, exist_ok=True)

    # Read link and title from the same <a> element so they cannot drift
    # out of step with each other.
    for anchor in toc.xpath('//div[@class="clearfix dirconone"]/li/a'):
        link = anchor.get('href')
        if not link:
            continue
        chapter_title = ''.join(anchor.xpath('text()'))

        chapter_html = _fetch_html(link)
        try:
            content = ''.join(chapter_html.xpath('//*[@id="content"]/text()'))
        except AttributeError:
            # Blank chapter page: report it and skip, rather than writing
            # the previous chapter's text under this chapter's name.
            print(link, "读取内容失败")
            continue

        chapter_dir = os.path.join(
            book_dir, CHAPTER_NAME_ILLEGAL.sub('', chapter_title))
        os.makedirs(chapter_dir, exist_ok=True)
        with open(os.path.join(chapter_dir, 'text.txt'), 'w',
                  encoding='utf-8') as handle:
            handle.write(content)


def get_url_page_book(url, root_dir=BOOK_ROOT):
    """Download one novel, retrying the whole book up to MAX_ATTEMPTS times.

    Args:
        url: address of the novel's detail page.
        root_dir: folder under which the novel's folder is created.

    Failures are printed, never raised, so one bad book does not take down
    its worker thread with a traceback.
    """
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            _crawl_book(url, attempt, root_dir)
        except Exception as exc:
            print("失败重试...", exc)
        else:
            return


def main():
    """Start one download thread per novel, throttled to spare the site."""
    threads = []
    for page_links in get_url_page():
        for book_url in page_links:
            worker = threading.Thread(target=get_url_page_book,
                                      args=(book_url,))
            worker.start()
            time.sleep(1)   # stagger thread start-up
            threads.append(worker)
        # Pause between listing pages. The original indentation was lost;
        # this placement is assumed to match the author's intent.
        time.sleep(90)
    for worker in threads:
        worker.join()


if __name__ == '__main__':
    main()
2.png (71.93 KB, 下载次数: 1)
下载附件 保存到相册
2019-9-19 13:39 上传
小说章节
1.png (70.14 KB, 下载次数: 0)
下载附件 保存到相册
2019-9-19 13:39 上传
全本小说文件夹
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。
- 上一篇: Python3批量复制视频文件
- 下一篇: pathon小白实践第四天,爬音乐