本文来源吾爱破解论坛
本帖最后由 ccbynow 于 2018-7-17 15:38 编辑
这里还有更多接口,请移步 https://github.com/Jack-Cherish/python-spider
[Python] 纯文本查看 复制代码
# Command-line downloader for Baidu Wenku documents.
#
# Usage: <script> URL TYPE, where TYPE is DOC, PPT, TXT or PDF
# (case-insensitive). DOC and TXT are saved as "<doc_id>.txt" in the current
# directory; PPT and PDF are saved as page images inside a "<doc_id>" folder.
import argparse
import json
import os
import re
import sys

import requests

# Seconds to wait for any single HTTP request before giving up, so that a
# stalled connection cannot hang the tool forever.
_REQUEST_TIMEOUT = 30


def _fetch(url):
    """Return the HTTP response for *url*.

    Raises:
        requests.RequestException: on transport failure, timeout, or an
            HTTP error status.
    """
    response = requests.get(url, timeout=_REQUEST_TIMEOUT)
    response.raise_for_status()
    return response


def _extract_doc_id(url):
    """Return the document id found between ``view/`` and ``.html`` in *url*.

    Raises:
        ValueError: if *url* contains no ``view/<id>.html`` segment.
    """
    found = re.search(r'view/(.*)\.html', url)
    if found is None:
        raise ValueError("no 'view/<id>.html' segment in URL: " + url)
    return found.group(1)


def _first_field(pattern, text, name):
    """Return group 1 of the first match of *pattern* in *text*.

    Raises:
        ValueError: if *pattern* does not match; *name* labels the missing
            field in the error message.
    """
    found = re.search(pattern, text)
    if found is None:
        raise ValueError("field %r missing from server response" % name)
    return found.group(1)


def _decode_json_escapes(raw):
    """Decode the body of a JSON string literal (quotes excluded) into text.

    Decoding as JSON keeps literal non-ASCII characters intact and merges
    ``\\uD83D\\uDE00``-style surrogate pairs; a plain ``unicode_escape``
    round-trip garbles the former and splits the latter. The
    ``unicode_escape`` path is kept as a fallback for fragments that are not
    valid JSON string bodies (e.g. truncated at an escaped quote).
    """
    try:
        return json.loads('"' + raw + '"')
    except ValueError:
        return raw.encode('utf-8').decode('unicode_escape', 'ignore')


def _iter_text_fragments(node):
    """Yield every string stored under a ``"c"`` key, in document order.

    Walks nested dicts and lists of an already-parsed JSON value.
    """
    if isinstance(node, dict):
        for key, value in node.items():
            if key == 'c' and isinstance(value, str):
                yield value
            else:
                yield from _iter_text_fragments(value)
    elif isinstance(node, list):
        for item in node:
            yield from _iter_text_fragments(item)


def DOC(url):
    """Download the text of a DOC-type document into ``<doc_id>.txt``.

    The view page is scanned for per-page ``...0.json`` URLs; each page's
    ``"c"`` (text) / ``"y"`` (vertical position) pairs are appended to the
    output file, starting a new line whenever the ``y`` value changes.

    Args:
        url: Address of the document's view page (``.../view/<id>.html``).

    Raises:
        ValueError: if the document id cannot be extracted from *url*.
        requests.RequestException: if any download fails.
    """
    doc_id = _extract_doc_id(url)
    html = _fetch(url).text
    page_urls = re.findall(r'(https.*?0.json.*?)\\x22}', html)
    # Only the first half of the matches is used; presumably the page embeds
    # every URL twice -- TODO confirm against a live page.
    page_urls = page_urls[:len(page_urls) // 2]

    filename = doc_id + '.txt'
    last_y = None  # "y" value of the previously written fragment
    with open(filename, 'a', encoding='utf-8') as out:
        for page_url in page_urls:
            # The URLs are embedded with escaping backslashes; strip them.
            page = _fetch(page_url.replace('\\', '')).text
            fragments = re.findall(r'"c":"(.*?)".*?"y":(.*?),', page)
            for raw_text, y_pos in fragments:
                text = _decode_json_escapes(raw_text)
                print(text)
                if y_pos != last_y:
                    # A different vertical position means a new line of text.
                    out.write('\n')
                    last_y = y_pos
                out.write(text.replace('\\', ''))
    print("文档保存在" + filename)


def _save_page_images(url, done_message):
    """Download every page image of a document into a ``<doc_id>`` folder.

    Images are written as ``img0.jpg``, ``img1.jpg``, ... in listing order,
    then *done_message* followed by the folder name and "文件夹" is printed.
    """
    doc_id = _extract_doc_id(url)
    listing_url = ("https://wenku.baidu.com/browse/getbcsurl?doc_id="
                   + doc_id + "&pn=1&rn=99999&type=ppt")
    listing = _fetch(listing_url).text
    image_urls = [
        escaped.replace('\\', '')
        for escaped in re.findall(r'{"zoom":"(.*?)","page"', listing)
    ]

    os.makedirs(doc_id, exist_ok=True)
    for index, image_url in enumerate(image_urls):
        target = os.path.join(doc_id, 'img' + str(index) + '.jpg')
        with open(target, 'wb') as image_file:
            image_file.write(_fetch(image_url).content)
    print(done_message + doc_id + "文件夹")


def PPT(url):
    """Download a PPT-type document as page images into ``<doc_id>/``.

    Args:
        url: Address of the document's view page (``.../view/<id>.html``).

    Raises:
        ValueError: if the document id cannot be extracted from *url*.
        requests.RequestException: if any download fails.
    """
    _save_page_images(url, "PPT图片保存在")


def TXT(url):
    """Download the text of a TXT-type document into ``<doc_id>.txt``.

    Queries the ``getdocinfo`` endpoint for the ``md5sum``, ``totalPageNum``
    and ``rsign`` fields, builds the ``retype/text`` URL from them, and
    appends every ``"c"`` string of the JSON response to the output file.

    Args:
        url: Address of the document's view page (``.../view/<id>.html``).

    Raises:
        ValueError: if the document id or a required info field is missing,
            or the text response is not valid JSON.
        requests.RequestException: if any download fails.
    """
    doc_id = _extract_doc_id(url)
    info_url = ("https://wenku.baidu.com/api/doc/getdocinfo?callback=cb&doc_id="
                + doc_id)
    info = _fetch(info_url).text
    md5 = _first_field('"md5sum":"(.*?)"', info, 'md5sum')
    page_count = _first_field('"totalPageNum":"(.*?)"', info, 'totalPageNum')
    rsign = _first_field('"rsign":"(.*?)"', info, 'rsign')

    # NOTE(review): md5 is appended directly after "type=txt" with no
    # separator, exactly as before -- presumably the field value already
    # carries its own "&name=" prefix; confirm against a live response.
    text_url = ('https://wkretype.bdimg.com/retype/text/' + doc_id
                + '?rn=' + page_count + '&type=txt' + md5 + '&rsign=' + rsign)
    payload = json.loads(_fetch(text_url).text)

    # Read the text straight from the parsed JSON instead of regex-matching
    # its repr(), which lost fragments containing quotes and left escape
    # sequences such as "\\t" in the output.
    texts = list(_iter_text_fragments(payload))
    print(texts)

    filename = doc_id + '.txt'
    with open(filename, 'a', encoding='utf-8') as out:
        out.writelines(texts)
    print("文档保存在" + filename)


def PDF(url):
    """Download a PDF-type document as page images into ``<doc_id>/``.

    Args:
        url: Address of the document's view page (``.../view/<id>.html``).

    Raises:
        ValueError: if the document id cannot be extracted from *url*.
        requests.RequestException: if any download fails.
    """
    _save_page_images(url, "PDF图片保存在")


# Maps the upper-cased TYPE argument to its handler. An explicit table
# replaces eval() on user input, which could execute arbitrary expressions.
_HANDLERS = {'DOC': DOC, 'PPT': PPT, 'TXT': TXT, 'PDF': PDF}


def main(argv=None):
    """Parse the command line and run the matching downloader.

    Args:
        argv: Argument list to parse; defaults to ``sys.argv[1:]``.

    Returns:
        0 on success, 1 if the type is unknown or the download failed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("url", help="Target Url,你所需要文档的URL", type=str)
    parser.add_argument('type', help="Target Type,你所需要文档的的类型(DOC|PPT|TXT|PDF)", type=str)
    args = parser.parse_args(argv)

    print("""
###Athor:52pojie
###TIPS:PDF|PPT只能下载图片
""")

    failure_message = "获取出错,可能URL错误\n使用格式name.exe url type\n请使用--help查看帮助"
    handler = _HANDLERS.get(args.type.upper())
    if handler is None:
        print(failure_message)
        return 1
    try:
        handler(args.url)
    except Exception as exc:  # top-level boundary: report and exit non-zero
        print(failure_message)
        print(repr(exc), file=sys.stderr)
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())

# Forum footer: this post is featured in the collection "兄dei,上车吗?"
# (topics: 376, subscribers: 863).
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。