本文来源吾爱破解论坛
本帖最后由 qq58452077 于 2019-8-11 14:10 编辑
#!/usr/bin/env python
# encoding: utf-8
"""
@version: 1.0
@author: CJ
@software: PyCharm
@file: 52guzhuang.py
@time: 2017/7/13 23:01
"""
import os
import random
import re
import time
import urllib.error
import urllib.request

import lxml.html
def serchIndex(url='http://www.52guzhuang.com/', timeout=30):
    """Fetch a page and return its body decoded as GBK text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on a blocking network operation before
            giving up.

    Returns:
        The response body decoded with the GBK codec.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid GBK.
    """
    req = urllib.request.Request(url)
    # Send a desktop-browser User-Agent instead of urllib's default one.
    req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
    # The context manager closes the response even if read() raises.
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        raw = resp.read()
    return raw.decode('GBK')
# Selector used for every post after the first one (and as the last
# fallback for the first post).
_POST_IMAGE_SELECTOR = 'table.plhin > tr:nth-child(1) > td.plc > div.pct > div.pcb > div.t_fsz > table > tr:nth-child(1) >td.t_f > div > ignore_js_op'

# Selectors tried in order on the first post; the first one that matches
# anything wins.
_FIRST_POST_SELECTORS = (
    '#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f >div:nth-last-of-type(1) > font > font > font > ignore_js_op',
    '#jiathis_share_CODE_HTML4 > div.t_fsz >table > tr:nth-child(1) >td.t_f > ignore_js_op',
    _POST_IMAGE_SELECTOR,
)


def findPageTotal(html,isNext):
    """Collect image URLs from a thread page and optionally read its pager.

    Args:
        html: Markup of the thread page.
        isNext: When true, also read the text of the pager link matched by
            ``a:nth-last-child(3)`` (presumably the last page number;
            confirm against the site's markup).

    Returns:
        A ``(dicts, page)`` tuple. ``dicts`` maps each image's ``aid``
        attribute to its absolute URL (site root + ``zoomfile`` attribute).
        ``page`` is the pager link text, or ``''`` when it was not requested
        or not found.
    """
    tree = lxml.html.fromstring(html)
    posts = tree.cssselect('div#postlist > div')
    dicts = {}
    page = ''
    # The final child of div#postlist is left out, exactly as before;
    # presumably it is not a post. TODO confirm against the page markup.
    for i, post in enumerate(posts[:-1]):
        if i == 0:
            links = []
            for selector in _FIRST_POST_SELECTORS:
                links = post.cssselect(selector)
                if links:
                    break
        else:
            links = post.cssselect(_POST_IMAGE_SELECTOR)
        for link in links:
            # Find the image by its zoomfile attribute instead of by child
            # position, so every match is handled the same way.
            for node in link.xpath('descendant-or-self::*[@zoomfile]'):
                src = node.get('zoomfile')
                if not src:
                    continue
                # Fall back to the file name stem when there is no aid, so
                # the key is always a usable string.
                name = node.get('aid') or src.rsplit('/', 1)[-1].rsplit('.', 1)[0]
                dicts[name] = "http://www.52guzhuang.com/" + src
    if isNext:
        pager = tree.cssselect("div#ct > div.pgs.mtm.mbm.cl > div.pg > a:nth-last-child(3)")
        if pager:
            # .text is None for an element without text; keep page a string.
            page = pager[0].text or ''
    return dicts,page
def dowmloadImage(image_url,filename,retries=10):
    """Download ``image_url`` and save the bytes to ``filename``.

    Args:
        image_url: Address of the image to fetch.
        filename: Path of the file to write.
        retries: Maximum number of download attempts.

    Returns:
        True if the file was written, False if every attempt failed.
    """
    for attempt in range(retries):
        if attempt:
            # Short pause before trying again after a failed attempt.
            time.sleep(0.1)
        req = urllib.request.Request(image_url)
        req.add_header('User-Agent','Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.116 Safari/537.36')
        try:
            with urllib.request.urlopen(req) as resp:
                image_data = resp.read()
        except urllib.error.URLError:
            # HTTPError is a subclass of URLError, so both are retried.
            continue
        # Written outside the try block so that file-system errors are not
        # mistaken for network failures.
        with open(filename, 'wb') as out:
            out.write(image_data)
        return True
    return False
def mkdirByGallery(path, base='E:\\py\\photo\\'):
    """Create the download directory for a gallery and return its path.

    Args:
        path: Gallery folder name; leading and trailing whitespace is
            stripped.
        base: Directory under which the gallery folder is created.

    Returns:
        The full path of the gallery directory.

    Raises:
        FileExistsError: If the target path exists but is not a directory.
    """
    # Strip leading and trailing whitespace from the folder name.
    path = os.path.join(base, path.strip())
    # Unlike os.mkdir, os.makedirs also creates missing parent directories;
    # exist_ok=True makes the call a no-op when the directory already exists.
    os.makedirs(path, exist_ok=True)
    return path
def main():
    """Prompt for a thread URL and download every image found in the thread."""
    # Stray whitespace from the prompt would corrupt the derived page URLs.
    url = input("请输入爬取网站:").strip()
    path = mkdirByGallery("52guzhuang")
    html = serchIndex(url)
    dicts = {}
    image_dict, page = findPageTotal(html, True)
    dicts.update(image_dict)
    # int() on the raw pager text fails if it holds anything besides digits,
    # so pull the number out first.
    page_match = re.search(r'\d+', page or '')
    if page_match:
        # Drop the trailing "-<page>-<n>.html" so other page numbers can be
        # substituted; fall back to cutting the last 9 characters when the
        # URL does not have that shape.
        tail = re.search(r'-\d+-\d+\.html$', url)
        base = url[:tail.start()] if tail else url[0:-9]
        for i in range(2, int(page_match.group()) + 1):
            html = serchIndex(base + "-" + str(i) + "-1.html")
            image_dict, _ = findPageTotal(html, False)
            dicts.update(image_dict)
    for k, v in dicts.items():
        dowmloadImage(v, os.path.join(path, str(k) + ".jpg"))


if __name__ == '__main__':
    main()
使用教程:
1.需要安装lxml和cssselect第三方库(代码中的 .cssselect() 依赖 cssselect 包)
2.数据来源:52古装网
喜欢汉服和汉元素的朋友欢迎来评分(喜欢汉服小姐姐也欢迎来评分)!!
##此代码仅供学习和参考
汉服小姐姐.png (269.75 KB, 下载次数: 4)
下载附件 保存到相册
2019-8-11 14:04 上传
汉服小姐姐
版权声明:
本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。