本文来源吾爱破解论坛
本帖最后由 ermao 于 2019-6-19 21:46 编辑
在问答区看到的,正好拿来练手。
没有反爬,加个 UA 就 OK 了。
[Python] 纯文本查看 复制代码
"""Download the articles of one Zhihu Daily section as HTML files.

The script first walks the section listing endpoint to collect article ids
(``get_papers``), then fetches every article body concurrently and writes one
``.html`` file per article (``thread_control`` / ``get_single_paper``).
"""
import json
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor

import requests

# Article ids collected by get_papers() and consumed by thread_control().
paper_id = []
# Number of get_single_paper() calls currently in flight; guarded by glock.
threads = 0
glock = threading.Lock()
header = {
    'User-Agent': (
        'DailyApi/4 (Linux; Android 5.1.1; xiaomi 6 Build/xiaomi/xiaomi 6/x86/'
        'LMY48Z/zh_CN) Google-HTTP-Java-Client/1.22.0 (gzip) '
        'Google-HTTP-Java-Client/1.22.0 (gzip)'
    )
}
# Output directory; set by get_papers() and read by get_single_paper().
path = ''
# Seconds to wait for a response, so a stalled connection cannot hang the run.
REQUEST_TIMEOUT = 10
# Directory names for the section ids the script knows about.
SECTION_DIRS = {'35': '小事', '2': '瞎扯-吐槽'}


def get_single_paper(paper_id):
    """Fetch one article and save its body as ``<id>-<title>.html``.

    The file is written into the directory named by the module-level ``path``
    (the current directory when ``path`` is empty). Characters that are not
    allowed in file names are stripped from the title.

    Args:
        paper_id: Article id as a string. Note that this parameter shadows
            the module-level ``paper_id`` list inside the function.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
        OSError: If the output file cannot be written.
    """
    global threads
    with glock:
        threads += 1
    try:
        url = 'https://news-at.zhihu.com/api/4/story/' + paper_id
        res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        try:
            resJson = json.loads(res.content.decode('utf-8'))
            title = resJson['title']
            body = resJson['body']
            # The response's 'css' field is deliberately ignored: the original
            # author found that linking it made no visible difference.
        except (TypeError, KeyError, ValueError):
            # ValueError covers both invalid JSON and invalid UTF-8.
            print('json读取失败')
            return
        # File name is "<id>-<title>.html".
        file_name = (
            paper_id + '-' + re.sub(r'[\\/:\*\?"<>\|]', '', title) + '.html'
        )
        # Every article goes to its own file, so the write needs no lock.
        with open(os.path.join('.', path, file_name), 'w',
                  encoding='utf-8') as f:
            f.write(body)
    finally:
        # Always give the slot back, even on failure, so the in-flight
        # counter never drifts upwards.
        with glock:
            threads -= 1


def get_papers(id, timestamp_end):
    """Collect the ids of a section's articles into the global ``paper_id``.

    Starts with the newest page of the section and keeps requesting the page
    before the last returned ``timestamp`` for as long as that timestamp is
    greater than ``timestamp_end``. Also sets the global ``path`` to the
    output directory for the section and creates it if needed.

    Args:
        id: Section id as a string (the name shadows the builtin but is kept
            for backward compatibility). '35' and '2' map to named
            directories; any other id uses the id itself as directory name.
        timestamp_end: Paging stops once the listing's timestamp is no longer
            greater than this value.

    Returns:
        The total number of ids held in ``paper_id``.

    Raises:
        requests.RequestException: If a listing request fails or times out.
        ValueError: If a listing response is not valid JSON.
    """
    global path
    path = SECTION_DIRS.get(id, str(id))
    os.makedirs(path, exist_ok=True)

    section_url = 'https://news-at.zhihu.com/api/4/section/' + id
    timestamp = 0
    while True:
        if timestamp > timestamp_end:
            url = section_url + '/before/' + str(timestamp)
        elif timestamp == 0:
            # First iteration: the newest page has no "before" suffix.
            url = section_url
        else:
            print('已到日期上限')
            break
        res = requests.get(url, headers=header, timeout=REQUEST_TIMEOUT)
        text = res.content.decode('utf-8')
        print(text)
        resJson = json.loads(text)
        for storie in resJson.get('stories', []):
            paper_id.append(storie['id'])
        timestamp = resJson.get('timestamp')
        if timestamp is None:
            # No paging cursor in the response: nothing further to request.
            break
    return len(paper_id)


def thread_control(N):
    """Download every collected article using at most ``N`` worker threads.

    Blocks until all downloads have finished. A failure in one article is
    reported and does not stop the others.

    Args:
        N: Maximum number of concurrent downloads; must be greater than 0.

    Raises:
        ValueError: If ``N`` is not greater than 0.
    """
    with ThreadPoolExecutor(max_workers=N) as pool:
        jobs = {
            pool.submit(get_single_paper, str(iii)): iii for iii in paper_id
        }
    # Leaving the "with" block waits for every job, so all results are ready.
    for job, p_id in jobs.items():
        error = job.exception()
        if error is not None:
            print('下载失败 {}: {}'.format(p_id, error))
    print('已完成')


if __name__ == '__main__':
    # Section ids: 35 = 小事, 2 = 瞎扯·如何正确的吐槽.
    # Example cut-offs noted by the original author:
    #   1490536800 (labelled 20170306), 1553608800 (labelled 20190306).
    # First argument is the section id, second is the timestamp limit.
    nums = get_papers('2', 1490536800)
    print('ID采集完成,共' + str(nums) + '个,开始下载')
    thread_control(20)  # number of worker threads
    # Single-article example: get_single_paper('9712276')
截图:
22.png (102.4 KB, 下载次数: 0)
下载附件 保存到相册
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。