本文来源吾爱破解论坛
本帖最后由 wushaominkk 于 2019-3-14 15:55 编辑
刚接触爬虫没多久，希望大家多多指教。
代码如下
[Python] 纯文本查看 复制代码
"""Print Python job postings from lagou.com's position-search Ajax endpoint.

The script first visits two pages to collect the cookies that the original
author found the Ajax endpoint to require, then POSTs the search form for
pages 1..PAGE_COUNT and prints one line per returned position.
"""
import json
import re

# One "Name: value" pair per line.  The value runs to the end of the line so
# that values containing spaces (User-Agent, Accept-Encoding, ...) stay whole.
_HEADER_LINE = re.compile(
    r"^[ \t]*([^:\s]+)[ \t]*:[ \t]*(.*?)[ \t\r]*$", re.MULTILINE
)


def parse_headers(raw):
    """Convert a block of ``Name: value`` lines into a ``dict``.

    Args:
        raw: Header text, one header per line.  Blank lines and lines
            without a colon are ignored.

    Returns:
        A dict mapping each header name to its full value with surrounding
        whitespace stripped.  If a name occurs twice, the last value wins.
    """
    return dict(_HEADER_LINE.findall(raw))


# NOTE: "br" is deliberately not advertised in Accept-Encoding.  requests can
# only decode Brotli when an optional brotli package is installed; without it
# a Brotli-compressed body would fail to decode as UTF-8 JSON.
SEARCH_ID_HEADERS = parse_headers("""
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Host: www.lagou.com
Referer: https://www.lagou.com/
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
""")

IMG_HEADERS = parse_headers("""
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Cache-Control: max-age=0
Connection: keep-alive
Host: a.lagou.com
Upgrade-Insecure-Requests: 1
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
""")

# The (misspelled) name HEADES is kept so existing references keep working.
# Content-Length is intentionally omitted: the form body length varies with
# the page number, and requests computes the header from the actual body.
HEADES = parse_headers("""
Accept: application/json, text/javascript, */*; q=0.01
Accept-Encoding: gzip, deflate
Accept-Language: zh-CN,zh;q=0.9
Connection: keep-alive
Content-Type: application/x-www-form-urlencoded; charset=UTF-8
Host: www.lagou.com
Origin: https://www.lagou.com
Referer: https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
X-Anit-Forge-Code: 0
X-Anit-Forge-Token: None
X-Requested-With: XMLHttpRequest
""")

# Tracking endpoint whose response sets the first two cookies.
IMG_URL = (
    "https://a.lagou.com/collect?v=1&_v=j31&a=798985105&t=pageview&_s=1"
    "&dl=https%3A%2F%2Fwww.lagou.com%2F"
    "&dr=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D"
    "&ul=zh-cn&de=UTF-8"
    "&dt=%E6%8B%89%E5%8B%BE%E7%BD%91-%E4%B8%93%E4%B8%9A%E7%9A%84%E4%BA%92%E8%81%94%E7%BD%91%E6%8B%9B%E8%81%98%E5%B9%B3%E5%8F%B0_%E6%89%BE%E5%B7%A5%E4%BD%9C_%E6%8B%9B%E8%81%98_%E4%BA%BA%E6%89%8D%E7%BD%91_%E6%B1%82%E8%81%8C"
    "&sd=24-bit&sr=1920x1080&vp=846x921&je=0&_u=MEAAAAQBK~&jid=546309307"
    "&cid=1391633655.1547948848&tid=UA-41268416-1&_r=1&z=966384896"
)
# Search result page whose response sets the SEARCH_ID cookie.
SEARCH_ID_URL = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
# JSON endpoint that returns one page of positions per POST.
AJAX_URL = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

PAGE_COUNT = 20        # pages 1..20 are requested
REQUEST_TIMEOUT = 10   # seconds; prevents a stalled connection from hanging forever


def extract_positions(payload):
    """Return the list of position records in an Ajax response.

    Args:
        payload: The decoded JSON object returned by ``AJAX_URL``.

    Returns:
        ``payload["content"]["positionResult"]["result"]``, or ``None`` when
        the ``content`` key is missing or null.
    """
    content = payload.get("content")
    if content is None:
        return None
    return content["positionResult"]["result"]


def format_position(position):
    """Build the one-line summary printed for a single position record.

    Raises:
        KeyError: If the record lacks one of the five displayed fields.
    """
    return (
        f"{position['companyFullName']} ------>>>>> "
        f"{position['positionName']} ------>>>>>>"
        f"{position['salary']} ------>>>>>>"
        f"{position['formatCreateTime']} ------>>>>>>"
        f"{position['firstType']}"
    )


def main():
    """Collect the required cookies, then fetch and print every result page."""
    # Imported here so the pure helpers above stay importable (and testable)
    # on machines that do not have the third-party dependency installed.
    import requests

    cookie = {}
    with requests.Session() as session:
        # First and second cookies come from the tracking endpoint.
        response = session.get(
            url=IMG_URL, headers=IMG_HEADERS, timeout=REQUEST_TIMEOUT
        )
        cookie.update({
            "user_trace_token": response.cookies["user_trace_token"],
            "LGRID": response.cookies["LGRID"],
        })

        # Third cookie comes from the search result page.
        response = session.get(
            url=SEARCH_ID_URL, headers=SEARCH_ID_HEADERS, timeout=REQUEST_TIMEOUT
        )
        cookie.update({
            "SEARCH_ID": response.cookies["SEARCH_ID"],
        })

        for page in range(1, PAGE_COUNT + 1):
            data = {
                "first": "true",
                "pn": str(page),
                "kd": "python",
            }
            response = session.post(
                url=AJAX_URL,
                headers=HEADES,
                data=data,
                cookies=cookie,
                timeout=REQUEST_TIMEOUT,
            )
            # Fail with the HTTP status rather than a confusing JSON error.
            response.raise_for_status()
            payload = json.loads(response.content.decode("utf-8"))

            # Check for a missing "content" BEFORE dereferencing it; some
            # responses do not carry it.
            positions = extract_positions(payload)
            if positions is None:
                print("json没有content")
                continue
            for position in positions:
                print(format_position(position))


if __name__ == "__main__":
    main()
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。