本文来源吾爱破解论坛
本帖最后由 null119 于 2019-7-26 01:25 编辑 QQ截图20190725211259.png (176.06 KB, 下载次数: 0)
下载附件
保存到相册
极速截图201907260052.png (256.82 KB, 下载次数: 0)
下载附件
保存到相册
极速截图201907260053.png (279.29 KB, 下载次数: 0)
下载附件
保存到相册
htmltopdf.png (105.18 KB, 下载次数: 0)
下载附件
保存到相册
360截图16491217454771.png (87.51 KB, 下载次数: 0)
下载附件
保存到相册
先上图:
因文章太多,不能转成一个html,要然会卡死,SO,只能按分类转PDF了
代码:
[Python] 纯文本查看 复制代码
import re,json,os,sys,time,requests from lxml import etree from multiprocessing.dummy import Pool as ThreadPool from urllib.parse import quote,unquote def filterFName(FName): rstr = r"[\/\\\:\*\?\"\<\>\|]" new_name = re.sub(rstr, "_", FName) return new_name def mkdir(path): path = path.strip() isExists = os.path.exists(path) if not isExists: os.makedirs(path) def gethtml(url,encode): headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8'} r = requests.get(url,headers=headers) r.encoding = encode return r.text def writehtml(path,str): f = open(path,'w+',encoding='utf-8') f.write(str) f.close def postdata(url,pdata): headers = {'X-Requested-With': 'XMLHttpRequest'} rep = requests.post(url=url, data=pdata, headers=headers) return rep.text def forstr(mstr): mstr=mstr.replace('{', '').replace('}', '').replace(' ', '').replace('"', '') pId = mstr.split(',')[2].split(':')[1] id = mstr.split(',')[0].split(':')[1] name =mstr.split(',')[1].split(':')[1].replace('"','') return id,pId,name def getlv(lname,listtxt,num): global txt,count,zcount gurl = 'https://www.kanxue.com/chm-thread_last_read.htm' id,pId,name=forstr(listtxt) pdata = {"chmid": pId, "cateid": id, "nodename": quote(name, 'utf-8')} repdata = postdata(gurl, pdata) jsonstr = json.loads(repdata) txt = txt + '<h' + str(num) + '>' + name + '</h' + str(num) + '><br>' n = [] for j in zlist: jid,pId1,name1=forstr(j) if pId1==id: n.append(j) if len(n)>0: for k in n: fname = k.split(',')[1].split(':')[1].replace('"', '').replace(' ', '') llname = lname + '->' +fname getlv(llname,k,num+1) if jsonstr['code'] != '-1': for m in range(len(jsonstr['message'])): html=gethtml('https:'+jsonstr['message'][m]['url'],'utf-8') ehtml = etree.HTML(html) try: strs = ehtml.xpath('//*[@class="message break-all"]')[-1] except: pass else: count+=1 zcount+=1 sys.stdout.write('\r'+'此目录已获取:'+str(count)+'篇文章,当前:' +lname+'->'+jsonstr['message'][m]['name']) sys.stdout.flush() strs = etree.tostring(strs, encoding="utf-8", pretty_print=True, method="html").decode("utf-8") strs=re.sub('<h[1,2,3,4,5,6,7,8,9, ]','<b ',strs) strs = re.sub('</h[1,2,3,4,5,6,7,8,9]>', '</b>', strs) strs = re.sub('<img src="upload','<img src="https://bbs.pediy.com/upload',strs) strs = re.sub('<img src="/view', '<img src="https://bbs.pediy.com/view', strs) strs = re.sub('a href="attach-', 'a href="https://bbs.pediy.com/attach-', strs) txt=txt+'<br><br><h'+str(num+1)+'>'+jsonstr['message'][m]['name']+'</h'+str(num+1)+'><br>'+strs def getdata(url): global txt,zlist,count html=gethtml(url,'utf-8') pattern = re.compile('\{ id.*?\}') t=pattern.findall(html) toplist=[] zlist=[] for i in t: if 'pId: 0 'in i: toplist.append(i) else: zlist.append(i) for i in toplist: iid,itid,iname=forstr(i) zzlist = [] for j in range(len(zlist), 0, -1): jid,jpid,jname=forstr(zlist[j-1]) if jpid==iid: zzlist.append(zlist[j-1]) zlist.remove(zlist[j-1]) for j in range(len(zzlist), 0, -1): jid, jpid, jname = forstr(zzlist[j - 1]) mkdir(savepath + filterFName(iname)) dirname=iname+'->'+jname time_start=time.time() count = 0 print('开始获取【'+iname+'->'+jname+'】...') txt = '' getlv(dirname,zzlist[j-1],2) time_end=time.time() print('') print(iname+'->'+jname+':获取完成. 文章数:'+str(count)+',耗时:{:.2f} 秒.'.format(time_end - time_start)) writehtml(savepath + filterFName(iname)+'\\'+filterFName(jname)+'.html', txt) print('*' * 100) if __name__ == '__main__': global zlist,count,zcount count=0 zcount=0 url = 'https://www.kanxue.com/chm.htm' ztime=time.time() savepath='C:\\看雪知识库\\' getdata(url) print('全部任务完成,共获取文章: '+str(zcount)+' ,总耗时:{:.2f} 秒.'.format(time.time()-ztime))
转PDF的代码就不上了,很简单,不会的可以参考我其它的帖子代码
这些资料也上传到电子书屋
地址:https://www.52pojie.cn/forum.php?mod=viewthread&tid=997084
不得不说,资料实在是全,有这一套东西,根本不用再去到处找什么教程了
本帖被以下淘专辑推荐: · 实用工具|主题: 146, 订阅: 52
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。