首页 编程教程正文

python获取拉勾网职位信息,通过ajax

piaodoo 编程教程 2020-02-22 22:02:39 893 0 python教程

本文来源吾爱破解论坛

本帖最后由 wushaominkk 于 2019-3-14 15:55 编辑

刚接触爬虫没多久,希望大家多多指教
代码如下
[Python] 纯文本查看 复制代码

import requests
import re
import json
# Raw request headers, one "Name: value" pair per line; each block is turned
# into a dict further down.
#
# The forum markup "[url]...[/url]" that had leaked into the Host, Origin and
# Referer values has been removed: it is not part of a valid header value.
#
# "br" is deliberately not listed in Accept-Encoding.  The previous parsing
# only ever sent "gzip," (the value was cut at the first space), and a
# brotli-compressed body can only be decoded when an optional brotli package
# is installed.  NOTE(review): confirm before re-adding it.

# Headers for the HTML search-result page request.
SEARCH_ID_HEADERS = """
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
        Accept-Encoding: gzip, deflate
        Accept-Language: zh-CN,zh;q=0.9
        Cache-Control: max-age=0
        Connection: keep-alive
        Host: www.lagou.com
        Referer: https://www.lagou.com/
        Upgrade-Insecure-Requests: 1
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
"""
# Headers for the request to a.lagou.com.
IMG_HEADERS = """
        Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8
        Accept-Encoding: gzip, deflate
        Accept-Language: zh-CN,zh;q=0.9
        Cache-Control: max-age=0
        Connection: keep-alive
        Host: a.lagou.com
        Upgrade-Insecure-Requests: 1
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
"""
# Headers for the AJAX (JSON) search request.  No Content-Length is given
# here: the body length depends on the page number being requested, so a fixed
# value cannot be right for every page and is best left to the HTTP library.
HEADES = """
        Accept: application/json, text/javascript, */*; q=0.01
        Accept-Encoding: gzip, deflate
        Accept-Language: zh-CN,zh;q=0.9
        Connection: keep-alive
        Content-Type: application/x-www-form-urlencoded; charset=UTF-8
        Host: www.lagou.com
        Origin: https://www.lagou.com
        Referer: https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=
        User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36
        X-Anit-Forge-Code: 0
        X-Anit-Forge-Token: None
        X-Requested-With: XMLHttpRequest
"""

# Matches one "Name: value" line.  The value runs to the end of the line so
# that values containing spaces (User-Agent, Content-Type, Accept, ...) are
# kept whole; surrounding blanks are stripped.
_HEADER_LINE = re.compile(r"^[ \t]*([^:\s]+):[ \t]*(.*?)[ \t]*$", re.MULTILINE)


def _parse_headers(raw):
    """Convert a block of ``Name: value`` lines into a header dict.

    Args:
        raw: Multi-line string with one header per line.  Blank lines and
            lines that do not look like ``Name: value`` are ignored.

    Returns:
        A dict mapping each header name to its complete value.
    """
    return dict(_HEADER_LINE.findall(raw))


# Convert the raw header blocks into dicts.
SEARCH_ID_HEADERS = _parse_headers(SEARCH_ID_HEADERS)
IMG_HEADERS = _parse_headers(IMG_HEADERS)
HEADES = _parse_headers(HEADES)
# Cookies collected here are passed explicitly with the search requests later.
cookie = {}
# A single session object is used for every request in this script.
session = requests.session()
# First request: read the "user_trace_token" and "LGRID" cookies from the
# response of the a.lagou.com collect endpoint.
img_url_two = "https://a.lagou.com/collect?v=1&_v=j31&a=798985105&t=pageview&_s=1&dl=https%3A%2F%2Fwww.lagou.com%2F&dr=https%3A%2F%2Fwww.lagou.com%2Fjobs%2Flist_python%3FlabelWords%3D%26fromSearch%3Dtrue%26suginput%3D&ul=zh-cn&de=UTF-8&dt=%E6%8B%89%E5%8B%BE%E7%BD%91-%E4%B8%93%E4%B8%9A%E7%9A%84%E4%BA%92%E8%81%94%E7%BD%91%E6%8B%9B%E8%81%98%E5%B9%B3%E5%8F%B0_%E6%89%BE%E5%B7%A5%E4%BD%9C_%E6%8B%9B%E8%81%98_%E4%BA%BA%E6%89%8D%E7%BD%91_%E6%B1%82%E8%81%8C&sd=24-bit&sr=1920x1080&vp=846x921&je=0&_u=MEAAAAQBK~&jid=546309307&cid=1391633655.1547948848&tid=UA-41268416-1&_r=1&z=966384896"
requ = session.get(img_url_two, headers=IMG_HEADERS)
cookie.update(
    {name: requ.cookies[name] for name in ("user_trace_token", "LGRID")}
)
# Second request: read the "SEARCH_ID" cookie from the response of the
# search-result page.
SEARCH_ID_URL = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
requ = session.get(SEARCH_ID_URL, headers=SEARCH_ID_HEADERS)
cookie.update(SEARCH_ID=requ.cookies["SEARCH_ID"])
# The endpoint is identical for every page, so it is defined once up front.
Ajax_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"
# Request result pages 1..20 for the keyword "python" and print one line per
# position: company, position name, salary, creation time and first type.
for j in range(0, 20):
    data = {
        "first": "true",
        "pn": str(j + 1),
        "kd": "python",
    }
    requ = session.post(url=Ajax_url, headers=HEADES, data=data, cookies=cookie)
    # The response body is JSON.
    jsonText = json.loads(requ.content.decode('utf-8'))
    # Some responses carry no "content".  This has to be checked BEFORE the
    # result list is touched; otherwise the lookup below raises instead of
    # reporting the problem.
    content = jsonText.get("content")
    if content is None:
        print("json没有content")
        continue
    for position in content['positionResult']['result']:
        print(position['companyFullName'] + "   ------>>>>>   " +
              position['positionName'] + "    ------>>>>>>" +
              position['salary'] + "    ------>>>>>>" +
              position['formatCreateTime'] + "    ------>>>>>>" +
              position['firstType'])

版权声明:

本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。

有关影视版权:本站只供百度云网盘资源,版权均属于影片公司所有,请在下载后24小时删除,切勿用于商业用途。本站所有资源信息均从互联网搜索而来,本站不对显示的内容承担责任,如您认为本站页面信息侵犯了您的权益,请附上版权证明邮件告知【754403226@qq.com】,在收到邮件后72小时内删除。本文链接:https://www.piaodoo.com/7402.html

搜索