本文来源吾爱破解论坛
拿去玩,python简书搜索查询采集工具附py源码
拿去玩,python简书搜索查询采集工具附py源码
[Python] 纯文本查看 复制代码
# Jianshu (jianshu.com) article scraper.
#
# hqlj() posts a search query page by page, extracts each result's slug,
# and hands the article URL to hqnr(), which saves the article HTML body
# to ./jianshu/<title>/<title>.txt and downloads every embedded image.
import requests
from bs4 import BeautifulSoup
import re
import os
import json
import urllib.parse

# Single user-agent shared by both functions (was duplicated inline).
_UA = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'


def hqnr(url):
    """Scrape one Jianshu article.

    Saves the article body (HTML fragment plus a source footer) to
    ./jianshu/<title>/<title>.txt and downloads each <img> referenced via
    its ``data-original-src`` attribute into the same directory.

    :param url: full article URL, e.g. https://www.jianshu.com/p/<slug>
    :raises AttributeError: if the page has no <h1> (layout change).
    """
    headers = {'user-agent': _UA}
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')

    # Article title doubles as the directory/file name, so strip every
    # character that is illegal in Windows file names.
    h = re.sub(r'[\|\/\<\>\:\*\?\\\"]', "_", soup.h1.text)
    print(h)
    os.makedirs(f'./jianshu/{h}/', exist_ok=True)

    # Author link; rendered as an HTML "source" footer appended to the body.
    author = soup.select('a[class="title"]')[0].text
    author = f'\n<p>来源:{author}</p>'
    print(author)

    # Article body container. Guard against None: previously a layout
    # change silently wrote the literal string "None" to the output file.
    body = soup.find(class_="show-content-free")
    if body is None:
        print(f"未找到正文:{url}")
        return
    ym = '%s%s' % (body, author)
    with open(f'./jianshu/{h}/{h}.txt', 'w', encoding='utf-8') as f:
        f.write(ym)

    # Download every embedded image.
    for tag in body.find_all("img"):
        src = tag.get('data-original-src')
        if not src:
            # Some <img> tags (emoji, placeholders) carry no original src.
            continue
        img_url = f'https:{src}'
        # Last 10 chars of the URL serve as a (mostly) unique file name;
        # append .jpg when no recognizable image extension is present
        # (collapses the original triple-nested `if not ... in` checks).
        img_name = img_url[-10:]
        if not any(ext in img_name for ext in ('jpeg', 'png', 'jpg')):
            img_name = f'{img_name}.jpg'
        print(img_url)
        print(img_name)
        try:
            # Was: a second, redundant requests.get(img_url) after the
            # file was already written — removed (one request per image).
            r = requests.get(img_url, headers=headers)
            r.raise_for_status()
            with open(f'./jianshu/{h}/{img_name}', 'wb') as f:
                f.write(r.content)
            print(f"保存{img_name}图片成功!")
        except requests.RequestException:
            # Narrowed from a bare `except:` that swallowed everything,
            # including KeyboardInterrupt. Best-effort per image: log and
            # continue with the next one.
            print(f"保存{img_name}图片失败!")
            print(img_url)
    print(f"保存所有 {h} 图片成功!")


def hqlj(keywords, n):
    """Search Jianshu and scrape every article on pages 1..n.

    :param keywords: URL-quoted search string (caller quotes it).
    :param n: number of result pages to walk.
    """
    # NOTE(review): cookie and x-csrf-token are a hardcoded, long-expired
    # session capture — the search endpoint will reject them eventually;
    # they need to be refreshed from a live browser session.
    headers = {
        'cookie': '__yadk_uid=Y3mKqaNc5fm6TD1MEDd3JM1tU1wEwW3G; read_mode=day; default_font=font2; locale=zh-CN; Hm_lvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1558517170,1559028759,1559108589; sensorsdata2015jssdkcross=%7B%22distinct_id%22%3A%2216adedcc00243c-0b4fae0bf62d43-43450521-1296000-16adedcc005188%22%2C%22%24device_id%22%3A%2216adedcc00243c-0b4fae0bf62d43-43450521-1296000-16adedcc005188%22%2C%22props%22%3A%7B%22%24latest_traffic_source_type%22%3A%22%E8%87%AA%E7%84%B6%E6%90%9C%E7%B4%A2%E6%B5%81%E9%87%8F%22%2C%22%24latest_referrer%22%3A%22https%3A%2F%2Fwww.baidu.com%2Flink%22%2C%22%24latest_referrer_host%22%3A%22www.baidu.com%22%2C%22%24latest_search_keyword%22%3A%22%E6%9C%AA%E5%8F%96%E5%88%B0%E5%80%BC%22%7D%7D; _m7e_session_core=16b43838d0bb0a1578f9c40f429eb8c4; Hm_lpvt_0c0e9d9b1e7d617b3e6842e85b9fb068=1559108983; signin_redirect=https%3A%2F%2Fwww.jianshu.com%2Fsearch%3Fq%3D%25E4%25BA%25A7%25E5%2593%2581%25E8%25AE%25BE%25E8%25AE%25A1%26page%3D1%26type%3Dnote',
        'user-agent': _UA,
        'x-csrf-token': 'm7+AgIi/4xUG6ClHPxPUdEc/Ym68P0qNIeWVKPkT++Bpxx/iALxsDm6HWlQWlO0oSMba/RKJvJ5+akVg9BvGvQ==',
    }
    for page in range(1, n + 1):
        url = f"https://www.jianshu.com/search/do?q={keywords}&type=note&page={page}&order_by=default"
        resp = requests.post(url, headers=headers)
        data = json.loads(resp.text)
        # Each search hit carries a 'slug' that forms the article URL.
        for entry in data['entries']:
            lj = f"https://www.jianshu.com/p/{entry['slug']}"
            print(lj)
            hqnr(lj)


if __name__ == '__main__':
    keywords = input("请输入要搜索的内容:")
    n = int(input("请输入页码:"))
    # Percent-encode the query so non-ASCII keywords survive the URL.
    keywords = urllib.parse.quote(keywords)
    print(keywords)
    hqlj(keywords, n)
简书1.png (136.39 KB, 下载次数: 1)
下载附件 保存到相册
简书采集运行.gif (730.57 KB, 下载次数: 0)
下载附件 保存到相册
运行2.png (131.34 KB, 下载次数: 1)
下载附件 保存到相册
附exe地址:链接: https://pan.baidu.com/s/1v4u7ao5w1_diTmkR4blQcQ 提取码: eg7j
使用说明:
右键 编辑
第一次 需要 输入要搜索的关键词 比如 python
第二次 需要输入要采集的页码数 比如 10
大家可以自行尝试!
exe运行环境 win7 64位
采集个别情况还是存在bug,玩玩就好! 本帖被以下淘专辑推荐: · 鱼木收集|主题: 2120, 订阅: 1667
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。
- 上一篇: pak文件解析代码
- 下一篇: 使用python获取X信群聊信息并进行分析