本文来源吾爱破解论坛
本帖最后由 牵手丶若相惜 于 2020-1-7 14:28 编辑
仅限学习!仅限学习!!仅限学习!!!
————————————————————
爬取的地址:https://hotels.ctrip.com/top/beijing1/zuijinreping-p1
翻两页看看,发现只是地址最后一个数字在变,不是异步加载,数据一般都直接写在页面源码里
找到代码里的数据 用正则匹配
匹配出来后 写入文件
可视化只做了一个柱状图 和 词云 可能没什么卵用 只是练手
柱状图 只做了前十个 因为那么多个不可能全部画出来
词云是把每个酒店的热评 分词得到的
词云图显示的很慢很慢很慢 词云的高度和宽度越大 显示的越慢 你如果都设置为100也挺快的 就是看不清楚
运行时如果控制台出现和我一样的红字,那只是警告/日志信息,不是报错,下面放图
自己看吧
[Python] 纯文本查看 复制代码
"""Scrape the Ctrip "Beijing hotels, most recently reviewed" ranking.

The script downloads the ranking pages, extracts one record per hotel with a
regular expression, appends the records to a CSV file, and finally draws a
bar chart (price vs. number of reviews for the first ten hotels) and a word
cloud built from the hotels' highlighted comments.

For learning purposes only.
"""
import csv
import os
import re
import time

import jieba
import matplotlib.pyplot as plt
import pandas
import requests
from wordcloud import WordCloud

# Request headers copied from a browser session.
# NOTE(review): the cookie is a captured personal session from 2020 and has
# presumably expired -- replace it with your own if the site rejects requests.
headers = {
    "cookie": 'magicid=ODDiKU0smOx4UPgURZ1vA0U4K8vbcR/FEDtpcCCfq83BaLSQv4yIN4/TI76Mhhde; _RSG=Jh4KiKYyfoAUW2GZ3iWNE8; _RGUID=aca9f225-2c50-4ad3-8397-82c0e340281e; _RDG=28c3e947f0d89520f613eb6277faa7e639; MKT_OrderClick=ASID=4897799752CNLL1ODh6-YCFQ2ZvAod3lQA5g7868292877463892359&AID=4897&CSID=799752&OUID=tongyong19&CT=1578203389740&CURL=https%3A%2F%2Fhotels.ctrip.com%2F%3Fallianceid%3D4897%26sid%3D799752%26ouid%3Dtongyong19%26bd_vid%3D7868292877463892359%26gclid%3DCNLL1ODh6-YCFQ2ZvAod3lQA5g%26gclsrc%3Dds&VAL={"pc_vid":"1578203387303.2a17p2"}; MKT_CKID=1578203389776.qaj18.ohh1; _gcl_dc=GCL.1578203390.CNLL1ODh6-YCFQ2ZvAod3lQA5g; _ga=GA1.2.1528899649.1578203390; MKT_Pagesource=PC; __utma=1.1528899649.1578203390.1578203407.1578203407.1; __utmz=1.1578203407.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); Session=smartlinkcode=U130026&smartlinklanguage=zh&SmartLinkKeyWord=&SmartLinkQuary=&SmartLinkHost=; Union=AllianceID=4897&SID=130026&OUID=&createtime=1578204335&Expires=1578809134634; _abtest_userid=f773ddda-83a3-47f8-8805-853acbdbd2a4; hoteluuid=AS0BC9OgMT1M2Tp1; _HGUID=%01%03%01Y%06RRUMR%03UPMT%01%04SMXSYWMXR%03P%05STPRXQ%05; fcerror=856437406; _zQdjfing=3a923ad5c086275ad0186ad95fa4cc3165bb186ad94ea084275ad0; HotelDomesticVisitedHotels1=5028983=0,0,5,54,/20030g00000086cpe5207.jpg,&37053067=0,0,5,57,/200g15000000xdeyc6E9B.jpg,&345025=0,0,4.7,6077,/200k0n000000ei2w6B6A1.jpg,; _gid=GA1.2.1560812336.1578364461; MKT_CKID_LMT=1578364460851; appFloatCnt=6; _bfs=1.1; _bfa=1.1578203387303.2a17p2.1.1578364458160.1578370337244.3.37; _RF1=117.30.47.199; _jzqco=%7C%7C%7C%7C1578364462902%7C1.1534693112.1578203389772.1578364597203.1578370340341.1578364597203.1578370340341.undefined.0.0.29.29; __zpspc=9.4.1578370340.1578370340.1%232%7Cwww.baidu.com%7C%7C%7C%7C%23; _bfi=p1%3D102085995%26p2%3D102085995%26v1%3D37%26v2%3D36',  # noqa: E501
    "referer": "https://hotels.ctrip.com/top/beijing1/zuijinreping",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36",  # noqa: E501
}

# Only the trailing page number changes between ranking pages.
url = "https://hotels.ctrip.com/top/beijing1/zuijinreping-p{}"

# File the scraped rows are appended to and later read back from.
CSV_PATH = "北京酒店top.csv"
CSV_COLUMNS = ["name", "address", "ratings", "Number_of_Commentaries", "comments", "price"]

# Number of hotels shown in the bar chart; more would not be readable.
TOP_N = 10

# One match per hotel. Groups: name, address, rating/review-count markup,
# highlighted comment, price.
# NOTE(review): the pattern was reassembled from a line-wrapped paste; the
# whitespace between `target="_blank"` and `>` before `<span><b>` is assumed
# to be a single space -- confirm against the live page source.
HOTEL_PATTERN = re.compile(
    r'target="_blank" title="(.*?)" href="/hotel.*?style="color:Black">(.*?)</a></p>'
    r'.*?target="_blank" ><span><b>(.*?)</a></p>'
    r'.*?<i class="qot_l"></i>(.*?)<i class="qot_r">'
    r'.*?<dfn>¥</dfn> (\d+)<span>',
    re.S,
)

# Deletes line breaks and tabs so a comment always stays on one CSV line.
_STRIP_LAYOUT = str.maketrans("", "", "\n\r\t")


def data_filtering(data):
    """Clean the regex matches of one page and append them to the CSV file.

    Args:
        data: iterable of 5-tuples ``(name, address, rating_markup,
            comment, price)`` as produced by ``HOTEL_PATTERN.findall``.

    Each tuple becomes one CSV row with the columns listed in
    ``CSV_COLUMNS``. Rows are written with ``csv.writer`` so that commas
    inside a name, address or comment are quoted instead of corrupting the
    column layout.
    """
    rows = []
    for name, address, rating_markup, comment, price in data:
        # rating_markup holds "<rating></b>...</span>来自<count>..."; the part
        # before the first "</span>" is the rating, the next part the count.
        parts = rating_markup.split("</span>")
        ratings = parts[0].replace("</b>", "")
        review_count = parts[1].replace("来自", "") if len(parts) > 1 else ""
        rows.append([
            name,
            address,
            ratings,
            review_count,
            comment.translate(_STRIP_LAYOUT),
            price + "起",
        ])

    # newline="" is what the csv module requires to avoid blank lines on Windows.
    with open(CSV_PATH, "a", encoding="utf-8", newline="") as file:
        csv.writer(file).writerows(rows)


def Histogram(df):
    """Draw price and review-count bars for the first ``TOP_N`` hotels.

    Args:
        df: DataFrame with at least the ``name``, ``price`` and
            ``Number_of_Commentaries`` columns, as read from ``CSV_PATH``.

    Returns:
        The de-duplicated DataFrame. The input frame is not modified.
    """
    # SimHei is a CJK-capable font; without it the Chinese labels render as boxes.
    plt.rcParams["font.sans-serif"] = ["simHei"]

    df = df.drop_duplicates()
    top = df.head(TOP_N)

    # Work on derived Series instead of assigning into a slice of df, which
    # triggered pandas' SettingWithCopyWarning in the previous version.
    prices = top["price"].astype(str).str.replace("起", "", regex=False).astype(int)
    review_counts = (
        top["Number_of_Commentaries"]
        .astype(str)
        .str.replace("位住客点评", "", regex=False)
        .astype(int)
    )

    # Price bars sit on the category positions 0..n-1.
    plt.bar(top["name"], prices, width=0.3)
    # Review-count bars are shifted by one bar width so the pairs sit side by side.
    shifted = [i + 0.3 for i in range(len(top))]
    plt.bar(shifted, review_counts, width=0.3)

    plt.legend(["价格", "评价人数"])
    plt.grid()
    # Tilt the x labels by -17 degrees so long hotel names do not overlap.
    plt.xticks(rotation=-17)
    plt.show()
    return df


def word_cloud(df, font_path=r"C:\Windows\Fonts\simHei.ttf", width=10000, height=10000,
               max_words=2000):
    """Show a word cloud built from the ``comments`` column of *df*.

    Args:
        df: DataFrame with a ``comments`` column.
        font_path: path of a font that contains CJK glyphs. The default is
            the Windows location of SimHei.
        width: canvas width in pixels.
        height: canvas height in pixels. Rendering time grows with the
            canvas size; the defaults are large and slow, pass smaller
            values (e.g. 1000) for a quick preview.
        max_words: maximum number of words placed in the cloud.
    """
    # Missing comments are skipped so the literal "nan" does not end up in the cloud.
    text = "".join(str(comment) for comment in df["comments"].dropna())
    if not text.strip():
        print("No comments available, skipping word cloud")
        return

    # jieba segments the Chinese text; WordCloud expects space-separated tokens.
    words = " ".join(jieba.cut(text))
    cloud = WordCloud(
        font_path=font_path,
        max_words=max_words,
        width=width,
        height=height,
    ).generate(words)

    plt.imshow(cloud)
    plt.axis("off")
    plt.show()


def main(max_pages=5000, delay=0.0):
    """Scrape the ranking pages, store them as CSV, then plot the results.

    Args:
        max_pages: upper bound on the number of pages requested. Scraping
            stops earlier at the first page that yields no hotel.
        delay: seconds to wait between two page requests (0 disables the
            pause).
    """
    # Page numbering starts at 1 (".../zuijinreping-p1").
    for page in range(1, max_pages + 1):
        try:
            response = requests.get(url.format(page), headers=headers, timeout=10)
        except requests.RequestException as exc:
            # Stop paging but still visualise whatever was collected so far.
            print("Request for page", page, "failed:", exc)
            break

        data = HOTEL_PATTERN.findall(response.text)
        if not data:
            print("完成")
            break

        print("第", page, "页")
        data_filtering(data)
        if delay:
            time.sleep(delay)

    if not os.path.exists(CSV_PATH):
        print("No data was scraped, nothing to plot")
        return

    # Read back everything scraped so far (the file is opened in append mode,
    # so it may also contain rows from earlier runs).
    df = pandas.read_csv(CSV_PATH, sep=",", names=CSV_COLUMNS)
    Histogram(df)
    word_cloud(df)


if __name__ == '__main__':
    main()
Figure_1.png (470.92 KB, 下载次数: 1)
下载附件 保存到相册
2020-1-7 14:23 上传
词云
捕获4.PNG (37.35 KB, 下载次数: 0)
下载附件 保存到相册
2020-1-7 14:13 上传
在网页中的数据
捕获3.PNG (78.09 KB, 下载次数: 1)
下载附件 保存到相册
2020-1-7 14:24 上传
捕获5.PNG (59.21 KB, 下载次数: 2)
下载附件 保存到相册
2020-1-7 14:24 上传
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。