首页 编程教程正文

爬取哔哩哔哩弹幕并绘制折线图

piaodoo 编程教程 2020-02-22 22:14:53 1306 0 python教程

本文来源吾爱破解论坛

本帖最后由 麦田孤望者 于 2020-1-28 18:45 编辑

这几天实在是太闲了...于是码了一个爬弹幕的程序
用到的第三方库有:requests、PySimpleGUI、matplotlib
版本:必须是 Python 3.8 及以上,因为代码第42行用了海象操作符(:=)...当然把那里改成return (cid_tuple,part,re.findall(re.compile('aid=\d{8}',re.S),url)[0][-8:])应该也是没问题的,这样低版本的 Python 也能运行

生成效果

Figure_5.png (130.07 KB, 下载次数: 0)

下载附件  保存到相册

2020-1-28 18:44 上传


Figure_4.png (129.92 KB, 下载次数: 0)

下载附件  保存到相册

2020-1-28 18:45 上传


上代码
[Python] 纯文本查看 复制代码
import json
import os
import re
import sys
import time
from fnmatch import fnmatch

import matplotlib.pyplot as plt
import PySimpleGUI as sg
import requests
from pylab import mpl

# Use SimHei as the sans-serif font for every figure; presumably needed so the
# Chinese title/axis labels render instead of empty boxes (requires SimHei to
# be installed) - TODO confirm on non-Windows systems.
mpl.rcParams['font.sans-serif'] = ['SimHei']


def findall(tx1, tx2, tx3):
    """Return every shortest span of *tx3* enclosed by *tx1* and *tx2*.

    Both delimiters are spliced into the regular expression verbatim, so any
    regex metacharacters they contain remain active. ``re.S`` lets a span
    run across line breaks.

    Args:
        tx1: Regex fragment that must precede the captured text.
        tx2: Regex fragment that must follow the captured text.
        tx3: Text to search.

    Returns:
        List of captured strings, in order of appearance.
    """
    pattern = re.compile(f'{tx1}(.*?){tx2}', re.S)
    return pattern.findall(tx3)
def parse_1(url):
    """Resolve the cid of the first page of an ordinary ``/video/av...`` URL.

    The numeric av id is read from the URL and looked up through the
    ``x/player/pagelist`` API; the ``cid`` of the first listed page is
    returned.

    Args:
        url: Video page address, e.g.
            ``https://www.bilibili.com/video/av12345678?spm_id_from=...``.

    Returns:
        The ``cid`` value of the first entry in the API response's ``data``
        list.

    Raises:
        IndexError: If no av id can be found in *url*.
    """
    # Accept an av id of any length; the id ends at the first non-digit, so
    # trailing slashes and query strings need no special slicing.
    match = re.search(r'/video/av(\d+)', url)
    if match:
        aid = match.group(1)
    else:
        # Legacy fallback for addresses without an "av" prefix: accept an
        # eight-digit id at the very end of the URL (IndexError when absent).
        aid = re.findall(r'\d{8}', url[-8:])[0]
    res = requests.get(
        'https://api.bilibili.com/x/player/pagelist?aid={}&jsonp=jsonp'.format(aid)
    )
    return res.json()['data'][0]['cid']
    
def parse_2(url):
    """Collect page cids and part titles from a ``/blackboard/`` player URL.

    The page embeds a JSON object after ``window.__kanzaki_ranko=``; its
    ``main.pages`` list supplies one ``cid`` and one ``part`` title per entry.

    Args:
        url: Blackboard page address; it must contain an ``aid=<digits>``
            query parameter.

    Returns:
        A 3-tuple ``(cids, parts, aid)``: the list of cids, the list of part
        titles (same order), and the aid digits taken from *url* as a string.

    Raises:
        IndexError: If the embedded JSON marker or the ``aid=`` parameter is
            missing.
    """
    res = requests.get(url)
    html = findall('window.__kanzaki_ranko=', '</script>', res.text)[0]
    pages = json.loads(html)['main']['pages']
    cids = [page['cid'] for page in pages]
    parts = [page['part'] for page in pages]
    # Capture the whole id instead of assuming exactly eight digits.
    aid = re.findall(r'aid=(\d+)', url)[0]
    return (cids, parts, aid)

def parse_3(url):
    """Collect episode cids and titles from a ``/bangumi/`` page.

    The page embeds a JSON object between ``__INITIAL_STATE__=`` and the
    following ``;(function()``; its ``epList`` entries supply the data.

    Args:
        url: Bangumi page address.

    Returns:
        A 3-tuple ``(cids, titles, '')``. Each title is the episode's
        ``titleFormat`` and ``longTitle`` joined by a space. The third item
        is always an empty string, keeping the tuple the same length as
        ``parse_2``'s result.

    Raises:
        IndexError: If the embedded JSON cannot be located in the page.
    """
    res = requests.get(url).text
    # Raw string: "\(" is a regex escape here, not a Python string escape.
    html = findall('.__INITIAL_STATE__=', r';\(function\(\)', res)[0]
    episodes = json.loads(html)['epList']
    cid_list = [ep['cid'] for ep in episodes]
    title_list = [ep['titleFormat'] + ' ' + ep['longTitle'] for ep in episodes]
    return (cid_list, title_list, '')

def check(url, mode=''):
    """Validate *url* and hand it to the parser for its site section.

    Args:
        url: Address entered by the user.
        mode: Unused; kept so existing callers keep working.

    Returns:
        Whatever the selected parser returns: ``parse_1`` for ``/video/``,
        ``parse_2`` for ``/blackboard/``, ``parse_3`` for ``/bangumi/``.

    Raises:
        SystemExit: After printing a message, when *url* is not a
            bilibili.com address or belongs to a section that has no parser.
    """
    if not fnmatch(url, 'https://www.bilibili.com/*'):
        print('请输入正确视频地址!')
        sys.exit()
    if fnmatch(url, 'https://www.bilibili.com/video/*'):
        return parse_1(url)
    if fnmatch(url, 'https://www.bilibili.com/blackboard/*'):
        return parse_2(url)
    if fnmatch(url, 'https://www.bilibili.com/bangumi/*'):
        return parse_3(url)
    # Any other bilibili.com path has no parser. Stop here with the same
    # message instead of returning None and failing obscurely later on.
    print('请输入正确视频地址!')
    sys.exit()
    

def get_comment(cid):
    """Download the danmaku XML for *cid* and split it into params and text.

    Every ``<d p="...">text</d>`` element of the response yields one entry.

    Args:
        cid: Identifier interpolated into
            ``http://comment.bilibili.com/<cid>.xml``.

    Returns:
        A 2-tuple ``(params, texts)``. ``params[i]`` is the ``p`` attribute
        of the i-th element split on commas (a list of strings) and
        ``texts[i]`` is that element's text.
    """
    res = requests.get('http://comment.bilibili.com/{}.xml'.format(cid))
    res.encoding = 'utf-8'
    # One pattern captures attribute and text together, so the attribute is
    # never re-used as a regex and no fixed-width slicing is needed.
    entries = re.findall(r'<d p="(.*?)">(.*?)</d>', res.text, re.S)
    params = [attrs.split(',') for attrs, _text in entries]
    texts = [text for _attrs, text in entries]
    return (params, texts)
    
def check_again(tup, url):
    """Turn a parser result into ``(cid_list, title)`` for one video.

    Args:
        tup: Result of ``check``. A tuple is read as ``(cids, titles, ...)``
            and the user picks one title from a button dialog. Any other
            value is treated as a single cid.
        url: Page address; only fetched in the single-cid case, to read the
            video title from the page.

    Returns:
        A 2-tuple ``(cid_list, title)`` where ``cid_list`` holds exactly one
        cid: the chosen one as a string, or *tup* itself in the single-cid
        case.

    Raises:
        SystemExit: After printing a message, when the episode dialog is
            closed without a choice.
        IndexError: In the single-cid case, when the title cannot be found
            in the page.
    """
    if isinstance(tup, tuple):
        cids, titles = tup[0], tup[1]
        layout = [[sg.Button(name, font='宋体.ttf')] for name in titles]
        window2 = sg.Window('请确认第几话', layout=layout)
        event, _values = window2.Read()
        window2.close()
        if event not in titles:
            # The dialog was closed; do not silently fall back to an episode.
            print('未选择分集,已取消!')
            sys.exit()
        # index() returns the first button carrying the clicked title.
        chosen = titles.index(event)
        return ([str(cids[chosen])], str(titles[chosen]))

    res = requests.get(url)
    html = findall('<div id="viewbox_report" class="video-info report-wrap-module report-scroll-module">','class="video-title">',res.text)[0]
    title = findall('title="', '"', html)[0]
    return ([tup], title)

def main():
    """Ask for a video URL and fetch the danmaku of the selected video.

    Shows a one-line input window, stores the entered address in the global
    ``video_url``, resolves it through ``check`` and ``check_again`` and
    downloads the comments with ``get_comment``.

    Returns:
        A 2-tuple ``(comments, title)``: the result of the last
        ``get_comment`` call and the title reported by ``check_again``.
    """
    global video_url
    label = sg.Text('视频链接:', font='宋体.ttf')
    url_input = sg.Input('')
    confirm = sg.Button('确定', font='宋体.ttf')
    window = sg.Window('哔哩哔哩弹幕', layout=[[label, url_input, confirm]])
    # The event name is only a placeholder; it is overwritten in the loop.
    comments, values = window.Read()

    video_url = values[0]
    window.close()

    selection = check_again(check(video_url), video_url)
    for cid in selection[0]:
        comments = get_comment(int(cid))

    return (comments, selection[1])
def bubbleSort(arr):
    """Sort *arr* in place by the first item of each element and return it.

    The name is historical: the work is now done by the built-in stable
    sort, which gives the same ordering in O(n log n) instead of O(n^2).

    Args:
        arr: List of indexable elements (e.g. ``(time, text)`` tuples).

    Returns:
        The same list object, ordered ascending by ``element[0]``. Elements
        with equal keys keep their original relative order.
    """
    arr.sort(key=lambda item: item[0])
    return arr

 


def _bin_width(max_time):
    """Return the bin width in seconds for a timeline ending at *max_time*."""
    # Test the largest threshold first so every branch is reachable.
    if max_time > 3000:
        return 300
    if max_time > 1000:
        return 120
    if max_time > 500:
        return 30
    return 10


def _count_per_bin(times, width):
    """Count *times* into consecutive half-open bins of *width* seconds.

    Bin ``i`` covers ``[i*width, (i+1)*width)``; the last bin runs up to and
    including the largest time. Returns ``(counts, labels)`` where the
    labels are the bin start times, except the last one, which is the
    largest time itself.
    """
    max_time = max(times)
    full_bins = max_time // width
    counts = [0] * (full_bins + 1)
    for t in times:
        # Clamp so an unexpected negative time cannot wrap to the last bin.
        counts[max(t // width, 0)] += 1
    labels = [i * width for i in range(full_bins)] + [max_time]
    return counts, labels


def draw_():
    """Run the whole program: fetch danmaku via ``main`` and plot the counts.

    The first field of every comment's parameter list is read as its time in
    seconds (truncated to an integer). The times are grouped into bins and
    the number of comments per bin is shown as a line chart titled with the
    video title. Prints a message and returns without plotting when there
    are no comments.
    """
    comments, title = main()
    times = [int(float(fields[0])) for fields in comments[0]]
    if not times:
        print('该视频没有弹幕!')
        return

    counts, labels = _count_per_bin(times, _bin_width(max(times)))

    positions = range(len(labels))
    # Width grows with the number of ticks, but never drops below a usable
    # (and valid, i.e. positive) size for short videos.
    plt.figure(figsize=(max(len(labels) - 6, 6), 6.5))
    plt.plot(positions, counts, marker='o', mec='r', mfc='w', label=u'弹幕数量曲线图')

    plt.legend()  # show the legend
    plt.xticks(positions, labels, rotation=20)
    plt.margins(0)
    plt.subplots_adjust(bottom=0.15)
    plt.xlabel(u"时间/s ")  # x-axis label
    plt.ylabel("数量")  # y-axis label
    plt.title(title)  # chart title

    plt.show()
    
if __name__ == '__main__':
    try:
        draw_()
    except Exception as e:
        # Catch Exception rather than BaseException so that sys.exit() and
        # Ctrl+C end the program normally instead of being reported as errors.
        print('ERROR:',e)


不写注释真的不是好习惯...还有...不要在意我的变量名...没想到合适的就用的abcde

看不懂的可以评论区问我
还有...求评分

Figure_5.png (130.07 KB, 下载次数: 0)

下载附件  保存到相册

2020-1-28 18:44 上传

Figure_5.png

版权声明:

本站所有资源均为站长或网友整理自互联网或站长购买自互联网,站长无法分辨资源版权出自何处,所以不承担任何版权以及其他问题带来的法律责任,如有侵权或者其他问题请联系站长删除!站长QQ754403226 谢谢。

有关影视版权:本站只供百度云网盘资源,版权均属于影片公司所有,请在下载后24小时删除,切勿用于商业用途。本站所有资源信息均从互联网搜索而来,本站不对显示的内容承担责任,如您认为本站页面信息侵犯了您的权益,请附上版权证明邮件告知【754403226@qq.com】,在收到邮件后72小时内删除。本文链接:https://www.piaodoo.com/7936.html

搜索