
Batch-download a Twitter user's images

piaodoo · Programming tutorials · 2020-02-22 22:11:46 · Python tutorial

This article comes from the 吾爱破解 (52pojie) forum.

This post was last edited by arctanh on 2018-12-01 20:22.

I searched around on GitHub and couldn't find anything, so I cobbled together my own implementation and am sharing it for anyone who needs it. To get through the firewall it uses a local ShadowsocksR (酸酸乳) HTTP proxy.
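To sanity-check the proxy before running the full script, a minimal probe along these lines should work (my own sketch, reusing the same local 127.0.0.1:55555 endpoint the script assumes):

import requests

PROXIES = {'https': 'http://127.0.0.1:55555', 'http': 'http://127.0.0.1:55555'}
r = requests.get('https://twitter.com/robots.txt', proxies=PROXIES, timeout=10)
print(r.status_code)   # 200 means the tunnel is up; an exception means it is not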

#! python3
# -*-coding:utf-8-*-
# Only downloads images from original tweets. 2018-11-29
import os
import re
import json
import requests
import random
import time
from six.moves import queue as Queue
from threading import Thread

THREADS_NUM = 10            # number of download threads
PROXIES = {'https': 'http://127.0.0.1:55555', 'http': 'http://127.0.0.1:55555'}   # local SSR HTTP proxy (CONNECT for https)
DEFSAVEDIR = 'E:/twitter/'  # default save directory
HEADERS_LIST = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; x64; fr; rv:1.9.2.13) Gecko/20101203 Firebird/3.6.13',
    'Mozilla/5.0 (compatible, MSIE 11, Windows NT 6.3; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows; U; Windows NT 6.1; rv:2.2) Gecko/20110201',
    'Opera/9.80 (X11; Linux i686; Ubuntu/14.10) Presto/2.12.388 Version/12.16',
    'Mozilla/5.0 (Windows NT 5.2; RW; rv:7.0a1) Gecko/20091211 SeaMonkey/9.23a1pre'
]
HEADERS = {'User-Agent': random.choice(HEADERS_LIST)}   # one UA is picked per run and shared by all workers

class DownloadWorker(Thread):
    def __init__(self, queue, headers, proxies):
        Thread.__init__(self)
        self.queue = queue
        self.headers = headers
        self.proxies = proxies   

    def run(self):
        while True:
            picurl, savepath = self.queue.get()
            self.downloadfile(picurl, savepath)
            self.queue.task_done()

    def downloadfile(self, picurl, savepath):
        with requests.get(url=picurl, headers=self.headers, proxies=self.proxies) as respon:
            with open(savepath, 'wb') as f:
                print('downloading-->:' + picurl)
                f.write(respon.content)

class TwitterPicScraper(object):
    # constructor: initialize state and kick off the download run
    def __init__(self, user, headers, proxies):
        self.user = user        
        self.headers = headers
        self.proxies = proxies
        self.userdir = os.path.join(DEFSAVEDIR, user)
        self.queue = Queue.Queue()
        self.scheduling()

    def scheduling(self):        
        for i in range(THREADS_NUM):
            worker = DownloadWorker(queue=self.queue, headers=self.headers, proxies=self.proxies)
            worker.daemon = True
            worker.start()        
        self._getpic_list()
        # wait for the queue to drain before returning to the main thread
        self.queue.join()
        print("Finished downloading images for user [ %s ]" % self.user)
    # read the download position saved in pos.txt by the previous run
    def getpos(self):
        pospath = os.path.join(self.userdir, 'pos.txt')
        with open(pospath, 'r') as f:
            savedpos, savedpicname = f.read().splitlines(False)
        return savedpos, savedpicname
    # persist the newest download position
    def setpos(self, savedpos, savedpicname):
        pospath = os.path.join(self.userdir, 'pos.txt')
        with open(pospath, 'w') as f:
            f.write(savedpos + '\n' + savedpicname)
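    # For reference, pos.txt holds exactly two lines (the values below are
    # illustrative, not real ids):
    #   1068812345678901248   <- newest tweet item id already handled
    #   Dr0abcdEFGH.jpg       <- newest image filename already downloaded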
    # collect the image URLs and push them onto the download queue
    def _getpic_list(self):
        # baseurl fetches the first page of tweet images
        baseurl = 'https://twitter.com/i/profiles/show/{user}/media_timeline?for_photo_rail=true'
        # apiurl pages through older tweets via max_position, equivalent to scrolling down
        apiurl = ('https://twitter.com/i/profiles/show/{user}/timeline/tweets?include_available_features=1'
                  '&include_entities=1&max_position={pos}&reset_error_state=false')
        re_itemid = r'data-item-id="(.+?)"'
        re_photo = r'data-image-url="(.+?)"'
        starturl = baseurl.format(user=self.user)
        # first download for this user
        if not os.path.exists(self.userdir):
            os.makedirs(self.userdir)
            savedpos = "0"
            savedpicname = "000000.jpg"
        # otherwise resume from the saved position
        else:
            savedpos, savedpicname = self.getpos()
        # fetch the first page
        retjson = requests.get(url=starturl, headers=self.headers, proxies=self.proxies).json()
        list_itempos = re.findall(re_itemid, retjson['items_html'])
        list_picurl = re.findall(re_photo, retjson['items_html'])
        # remember the newest position, used for duplicate detection
        str_newpos = str_startpos = list_itempos[0]
        # list_picurl may be empty, so default the newest picture name to the saved value
        str_newpicname = savedpicname
        str_nextapipos = list_itempos[-1]
        # note: tweet ids are compared as strings, which assumes ids of equal length
        if str_startpos <= savedpos:
            print("User [ %s ] has no new tweets" % self.user)
        else:
            b_newpic_exists = False
            while retjson['new_latent_count'] > 0:
                # record the newest picture name once, to avoid re-downloading next run
                if not b_newpic_exists:
                    if list_picurl:
                        str_newpicname = list_picurl[0][28:]
                        b_newpic_exists = True
                # the saved position id falls inside this batch, though its picture list may be empty
                if str_nextapipos <= savedpos:
                    if list_picurl:
                        # the saved picture is in this batch; queue everything newer than it
                        i = list_picurl.index('https://pbs.twimg.com/media/' + savedpicname)   # forgot to prepend 'https://pbs.twimg.com/media/' (fixed 2018-12-01)
                        for picurl in list_picurl[:i]:
                            # len('https://pbs.twimg.com/media/') == 28
                            picpath = os.path.join(self.userdir, picurl[28:])
                            self.queue.put((picurl, picpath))
                    break
                else:
                    if list_picurl:
                        for picurl in list_picurl:
                            picpath = os.path.join(self.userdir, picurl[28:])
                            self.queue.put((picurl, picpath))
                    # keep paging
                    nextapiurl = apiurl.format(user=self.user, pos=str_nextapipos)
                    r = requests.get(url=nextapiurl, headers=self.headers, proxies=self.proxies)
                    print("get-->statuscode = %d" % r.status_code)    # for debugging
                    retjson = r.json()
                    # throttle: paging too fast from the main thread can trigger 404s
                    time.sleep(1)
                    list_picurl = re.findall(re_photo, retjson['items_html'])
                    list_itempos = re.findall(re_itemid, retjson['items_html'])
                    str_nextapipos = retjson['min_position']
            # save the newest download position
            self.setpos(str_newpos, str_newpicname)

if __name__ == "__main__":    
    if not os.path.exists(DEFSAVEDIR):
        os.makedirs(DEFSAVEDIR)    
    while True:
        username = input('Enter a Twitter username (q to quit): ')
        if username == 'q':
            break
        else:
            TwitterPicScraper(user=username, headers=HEADERS, proxies=PROXIES)
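
As a side note, downloadfile above writes whatever bytes come back, even for an error page. A slightly more defensive variant (my own sketch, not part of the original post) would check the status code and stream the body to disk:

    def downloadfile(self, picurl, savepath):
        # stream the response and only write the file on HTTP 200
        with requests.get(picurl, headers=self.headers, proxies=self.proxies,
                          stream=True, timeout=30) as respon:
            if respon.status_code != 200:
                print('skip (HTTP %d): %s' % (respon.status_code, picurl))
                return
            print('downloading-->:' + picurl)
            with open(savepath, 'wb') as f:
                for chunk in respon.iter_content(chunk_size=8192):
                    f.write(chunk)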
