Source: 吾爱破解论坛 (52pojie forum)
For learning purposes only! For learning only!! For learning only!!!
————————————————————————
Not very practical, and the data handling is only so-so; this is purely for learning.
I scrape the detailed information from the second-level pages. Most of it is actually available on the first-level page, but the first-level page doesn't show the company's exact name.
The cookies were captured after I logged in. Swap in your own; I've deleted mine from the code.
Sometimes the crawl breaks around page 5, so I added a check for that.
If the anomaly occurs, the script asks you to enter cookie1 and cookie2: refresh the pages manually in the browser, grab the fresh cookies, and paste them in.
First-level page: the listing page being scraped.
Second-level page: the detail page you get by clicking into a job posting.
cookie1 belongs to the first-level page.
cookie2 belongs to the second-level page.
Whether the same cookie works for both, I haven't tested; you can try it yourself (a quick way to check is sketched below).
You have to swap in fresh cookies for every run, otherwise nothing gets scraped.
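If you do want to test sharing one cookie, here is a minimal sketch: fetch a single detail page using the listing-page cookie and see whether real content comes back. The detail URL is a placeholder, and checking for the string 工商信息 (which the full script below also relies on) is my assumption for telling a real detail page apart from a verification page:
[Python]
import requests

# Hypothetical quick check: does the listing-page cookie (cookie1) also work
# on a detail page? Replace the cookie value and the placeholder URL.
test_headers = {
    "cookie": "<paste cookie1 here>",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
                  "(KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
}
detail_url = "https://www.zhipin.com/job_detail/xxxxxxxx.html"  # placeholder
resp = requests.get(detail_url, headers=test_headers)
# "工商信息" appears on real detail pages; a blocked request won't contain it
print("工商信息" in resp.text)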
[Python]
import requests
import re
import time
import random
from lxml import etree

# running counter used as the "ka" suffix for detail-page requests
search_list = 0
# page number to start (or resume) from
cd = 1

headers = {
    # fill in your own cookie after logging in (cookie1, for the listing page)
    "cookie": "",
    "referer": "https://www.zhipin.com/c101010100/?query=%E7%88%AC%E8%99%AB&page={}&ka=page-{}",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
}
headers_1 = {
    # fill in your own cookie after logging in (cookie2, for the detail pages)
    "cookie": "",
    "referer": "https://www.zhipin.com/c101010100/?query=%E7%88%AC%E8%99%AB&page=3&ka=page-3",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",
}
parameters = {
    "query": "爬虫",
    "page": "2",
    "ka": "page-2"
}


# parse the fields of interest out of a detail page
def detailed_information(response):
    # gs (company), cs (city), gzyq (job duties), rs (headcount), gz (salary),
    # name (job title), detail (skill requirements)
    try:
        gs = re.findall('<h3>工商信息</h3>(.*?)<div class="level-list">', response, re.S)[0] \
            .replace('<div class="name">', "").replace('</div>', "") \
            .replace('\n', "").replace(' ', "")
    except IndexError:
        gs = "null"
    html = etree.HTML(response)
    try:
        # the original id "macin" was a typo for "main"
        cs = html.xpath('//*[@id="main"]/div[1]/div/div/div[2]/p/text()')[0].replace("\n", "").replace(" ", "")
    except IndexError:
        cs = "null"
    yq = html.xpath('//*[@id="main"]/div[3]/div/div[2]/div[2]/div[1]/div/text()')
    # headcount: the original simply fakes this with a random number
    rs = random.randint(1, 10)
    try:
        gz = html.xpath('//*[@id="main"]/div[1]/div/div/div[2]/div[2]/span/text()')[0].replace("\n", "").replace(" ", "")
    except IndexError:
        gz = "null"
    try:
        name = html.xpath('//*[@id="main"]/div[1]/div/div/div[2]/div[2]/h1/text()')[0].replace("\n", "").replace(" ", "")
    except IndexError:
        name = "null"
    # fall back to "null" for any empty field
    # (the original assigned cs in all four of these checks by mistake)
    if len(cs) == 0:
        cs = "null"
    if len(gz) == 0:
        gz = "null"
    if len(name) == 0:
        name = "null"
    # the duties/requirements block reads "岗位职责:...技能要求:...",
    # so split on the full-width colon
    yq = "".join(yq).replace("\n", "").replace(" ", "")
    if len(yq) == 0:
        gzyq = "null"
        detail = "null"
    else:
        data = yq.split(":")
        if len(data) == 1:
            gzyq = data[0]
            detail = "null"
        elif len(data) == 2:
            gzyq = data[1]
            detail = "null"
        elif len(data) == 3:
            gzyq = data[1][:-4]
            detail = data[2]
        else:
            gzyq = "null"
            detail = "null"
    row = gs + "," + cs + "," + gzyq + "," + str(rs) + "," + gz + "," + name + "," + detail
    print(row)
    with open("Boss直聘.txt", "a", encoding="utf-8") as file:
        file.write(row)
        file.write("\n")


# visit each detail page collected from the listing page
def information(urls):
    global search_list
    headers_1["referer"] = headers["referer"]
    prefix = "https://www.zhipin.com"  # the scraped hrefs already start with "/"
    for url in urls:
        url = prefix + url
        search_list += 1
        # rebuild the "ka" parameter on every iteration so the counter stays
        # current (the original built it once, before the loop)
        parameters_1 = {"ka": "search_list_" + str(search_list)}
        response = requests.get(url, headers=headers_1, params=parameters_1).text
        detailed_information(response)
        time.sleep(random.randint(2, 4))


# extract the detail-page links from a listing page;
# re.findall returns [] when nothing matches, so no try/except is needed
# (the original returned 0 on exception, which would break len(urls) later)
def position_url(response):
    return re.findall(' <a href="(/job_detail/.*?html)"', response, re.S)


def main():
    url = "https://www.zhipin.com/c101010100/"
    referer_template = "https://www.zhipin.com/c101010100/?query=%E7%88%AC%E8%99%AB&page={}&ka=page-{}"
    # cd is a global: the page to resume from after a cookie refresh
    for i in range(cd, 6):
        parameters["page"] = str(i)
        parameters["ka"] = "page-" + str(i)
        if i > 1:
            # the referer should point at the previous page; format from the
            # template each time (the original tested i < 1, a dead branch,
            # and formatting headers["referer"] in place destroyed the template)
            page = str(i - 1)
            headers["referer"] = referer_template.format(page, page)
        response = requests.get(url, headers=headers, params=parameters).text
        urls = position_url(response)
        if len(urls) == 0:
            # anti-scraping kicked in (for me, usually around page 5):
            # return the page number so the crawl can resume there
            print("Exception: page", i, "returned no links")
            return i
        print("Page", i)
        time.sleep(random.randint(2, 4))
        information(urls)
    return 0


if __name__ == '__main__':
    # write the header row
    with open("Boss直聘.txt", "w", encoding="utf-8") as file:
        file.write("公司名称,工作城市,工作要求,招聘人数,工资情况,岗位名称,detail")
        file.write("\n")
    while True:
        result = main()
        if result == 0:
            print("Done")
            break
        cd = result
        headers["cookie"] = input("Enter cookie1: ")
        headers_1["cookie"] = input("Enter cookie2: ")
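The output is a comma-separated text file with a header row, so it can be pulled straight into pandas for further processing. A minimal sketch, assuming pandas 1.3+ is installed; on_bad_lines="skip" drops rows where a scraped field itself contained a comma:
[Python]
import pandas as pd

# Load the file written by the scraper above; malformed rows (e.g. a job
# description that itself contains a comma) are skipped rather than crashing.
df = pd.read_csv("Boss直聘.txt", on_bad_lines="skip")
print(df.head())
print(df["工资情况"].value_counts())  # quick look at the salary distribution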
[Screenshot 捕获.PNG: the scraped data]
[Screenshot 捕获1.PNG: the listing page being scraped]