本文来源吾爱破解论坛
本帖最后由 beisimm 于 2018-10-16 10:05 编辑
之前发的一个爬虫脚本被删了,本着学习的原则再写一个。以下为 Python 代码:
import os
import re
import threading

import requests
from lxml import etree

# Usage notes:
# 1. Replace base_url with the album page you want to crawl.
# 2. A "downloads" folder is created in the current directory and images
#    are saved into it.


class Umeispider:
    """Crawler for umei.cc wallpaper albums.

    Walks the numbered pages of one album (``<id>.htm``, ``<id>_2.htm``,
    ``<id>_3.htm``, ...) and downloads the single image found on each page.
    """

    # Pattern that strips an existing "_N" page suffix (e.g. "23547_3.htm"
    # -> "23547.htm") so the page sequence can be rebuilt from page 1.
    # The original code checked fixed offsets -7/-6 and therefore only
    # handled one- and two-digit page numbers; the regex handles any.
    _PAGE_SUFFIX = re.compile(r'_\d+(?=\.htm$)')

    def __init__(self):
        # Album page to crawl -- replace with your own URL.
        self.base_url = "http://www.umei.cc/bizhitupian/fengjingbizhi/23547.htm"

    def get_url(self):
        """Return the list of candidate page URLs for the album.

        The first entry is the album's base page (any ``_N`` suffix of
        ``base_url`` is stripped), followed by pages 2..49.  Pages past the
        end of the album are detected at request time in :meth:`run`.
        """
        first = self._PAGE_SUFFIX.sub('', self.base_url)
        urls = [first]
        template = first[:-len('.htm')] + '_{}.htm'
        urls.extend(template.format(page) for page in range(2, 50))
        return urls

    def run(self):
        """Fetch each album page and dispatch its image for download.

        Stops at the first page that no longer exists.
        """
        for url in self.get_url():
            # Bounded timeout so a stalled connection cannot hang the crawl.
            response = requests.get(url, timeout=10)
            html_str = response.content.decode('utf-8', errors='replace')
            # Stop when we run past the last page of the album.  The site
            # may answer with a real HTTP 404 or with a 200 page whose body
            # says "404 Not Found" -- accept either signal.
            if response.status_code == 404 or '404 Not Found' in html_str:
                break
            self.get_image_url_name(html_str)

    def create_folder(self, file_name):
        """Create directory *file_name* if it does not exist yet.

        ``exist_ok=True`` avoids the check-then-create race of the original
        ``os.path.exists``/``os.mkdir`` pair, which could raise when several
        download threads call this concurrently.
        """
        if not os.path.exists(file_name):
            os.makedirs(file_name, exist_ok=True)
            print('文件夹%s创建成功' % file_name)

    def get_image_url_name(self, html):
        """Extract the image URL and title from one page and download the
        image on a background thread.

        Pages where the expected elements are missing are skipped instead of
        raising ``IndexError`` as the original ``xpath(...)[0]`` did.
        """
        eroot = etree.HTML(html)
        image_urls = eroot.xpath('//div/p//img/@src')
        file_names = eroot.xpath('//strong/text()')
        if not image_urls or not file_names:
            return  # page layout changed or no image -- skip quietly
        threading.Thread(
            target=self.save_image, args=(image_urls[0], file_names[0])
        ).start()

    def save_image(self, image_url, name):
        """Download *image_url* and save it as ``downloads/<name>.jpg``."""
        print('开始下载:', name)
        content = requests.get(image_url, timeout=10).content
        # 这里输入文件夹名 (target folder name)
        folder_name = 'downloads'
        self.create_folder(folder_name)
        # Page titles can contain characters that are illegal in file names
        # (or act as path separators) -- replace them before building the path.
        safe_name = re.sub(r'[\\/:*?"<>|]', '_', name).strip()
        path = '%s/%s.jpg' % (folder_name, safe_name)
        with open(path, 'wb') as f:
            f.write(content)
        print('下载完成', name)


if __name__ == '__main__':
    spider = Umeispider()
    # spider.get_url()
    spider.run()
版权声明:
本站所有资源均为站长或网友整理自互联网,或为站长购买自互联网。站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任。如有侵权或者其他问题,请联系站长删除!站长QQ:754403226,谢谢。
- 上一篇: 用python对Azure Blob Storage进行操作
- 下一篇: 【转载】一键自动删除所有微博