This write-up originally came from the 吾爱破解 (52pojie) forum.
Project overview:
Premier League matches on Qiutan (球探网, win007.com)
1. Team information
(team ID, name, founding date, city, training ground, playing style/traits, win-rate stats)
http://zq.win007.com/cn/team/Summary/19.html
2. All matches from the 2013 season through 2019
(match id, host_id, guest_id, match details)
http://zq.win007.com/cn/League/2018-2019/36.html
3. For every player, collect that player's stats in each of the matches above
(player name, match id, team id, the player's stats in that match)
Click the score -> player statistics, e.g.
http://bf.win007.com/Count/1552443cn.htm
Everything is stored in MySQL. A quick sanity check of the raw data endpoint follows below, then the full spider.
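Before wiring everything into Scrapy, the raw season data can be previewed straight from the jsData endpoint that the spider below consumes. This is only a sanity-check sketch: it assumes the `requests` package is installed and that the endpoint still responds the way it did when this post was written; the URL pattern and regexes are copied from the spider.

# Sketch: preview the raw JS data the spider parses (assumes `requests` is installed).
import re
import time

import requests

version = time.strftime('%Y%m%d%H', time.localtime())
url = 'http://zq.win007.com/jsData/matchResult/2018-2019/s36.js?version={}'.format(version)
js_text = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text

# Alternating team id / team name pairs, e.g. ('25', '...')
teams = re.findall(r"\[(\d+),'(.*?)'", js_text)[1:]
print(teams[:5])

# One entry per round; each entry holds raw match rows like
# [851543,36,-1,'2013-08-17 19:45',25,58,'1-0','1-0',...]
rounds = re.findall(r'\[(\[\d{3,}.*?\])\];', js_text)
print(len(rounds), 'rounds found')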
spider page
# -*- coding: utf-8 -*-
import re
import time

import scrapy

from qiutan.items import SaichengItem
from qiutan.items import Team_DataItem
from qiutan.items import Member_Data_New_Item
from qiutan.items import Member_Data_Old_Item


class EcSpider(scrapy.Spider):
    name = 'Ec'
    allowed_domains = ['zq.win007.com', 'bf.win007.com']

    # Hand one URL per season to the scheduler
    def start_requests(self):
        # e.g. 2019042509 (the original bound this to a local variable named
        # `re`, shadowing the re module; renamed to `version` here)
        version = time.strftime('%Y%m%d%H', time.localtime())
        base_url = 'http://zq.win007.com/jsData/matchResult/{}/s36.js?version={}'
        date_lis = ['{}-{}'.format(i, i + 1) for i in range(2013, 2019)]
        for date in date_lis:
            req_base = scrapy.Request(base_url.format(date, version), callback=self.parse)
            req_base.meta['date'] = date
            req_base.meta['re'] = version
            yield req_base

    def team_data_id(self, response):
        # Collect every team's id and name as a flat [id, name, id, name, ...] list
        pat = re.compile(r"\[(\d+),'(.*?)'")
        ballteam = pat.findall(response.text)[1:]
        lis_all_team = []
        for item in ballteam:
            lis_all_team.append(item[0])
            lis_all_team.append(item[-1])
        return lis_all_team

    # Table 2: fixtures/results for every round of the season
    def parse(self, response):
        # Flat list of team id / team name entries
        lis_all_team = self.team_data_id(response)
        # All rounds of the season (38 rounds)
        ball_lunci_team = re.findall(r'\[(\[\d{3,}.*?\])\];', response.text)
        num = 0
        for eve_turn in ball_lunci_team:
            num += 1
            # The 10 match rows of this round
            eve_turn_team = re.findall(r'\[\d{6}.*?\]', eve_turn)
            for eve_turn_team_data in eve_turn_team:
                # Convert each row to a list and index into it, e.g.
                # [851543,36,-1,'2013-08-17 19:45',25,58,'1-0','1-0','7',
                #  '13',1.25,0.5,'2.5/3','1',1,1,1,1,0,0,'']
                lis = eve_turn_team_data.strip('[|]').replace('\'', '').split(',')
                # Find each team id's position in the id/name list; the name sits right after it
                index_num_h = lis_all_team.index(lis[4])
                index_num_g = lis_all_team.index(lis[5])
                item = SaichengItem()  # one fresh item per match row
                item['lunci'] = num
                bs_num_id = lis[0]
                item['bs_time'] = lis[3]  # e.g. '2014-05-04 23:00' (str)
                item['bs_num_id'] = bs_num_id
                item['host_team'] = lis_all_team[index_num_h + 1]
                item['h_team_id'] = lis[4]
                item['res_score'] = lis[6]
                item['guest_team'] = lis_all_team[index_num_g + 1]
                item['g_team_id'] = lis[5]
                item['all_rang'] = self.rangqiu(lis[10])
                item['half_rang'] = self.rangqiu(lis[11])
                item['sizes_balls_a'] = lis[12]
                item['sizes_balls_h'] = lis[13]
                item['half_score'] = lis[7]
                yield item
                # Build the detail URL for each match, e.g.
                # http://bf.win007.com/detail/1130517cn.htm
                # Matches before 2014-05-12 use the old page layout (string comparison on the date)
                if item['bs_time'] < '2014-05-12 0:00':
                    url = 'http://bf.win007.com/detail/{}cn.htm'.format(bs_num_id)
                    req = scrapy.Request(url, callback=self.bs_data_old)
                else:
                    url = 'http://bf.win007.com/Count/{}cn.htm'.format(bs_num_id)
                    req = scrapy.Request(url, callback=self.bs_data_new)
                req.meta['bs_num_id'] = bs_num_id
                req.meta['l_team_id'] = lis[4]
                req.meta['r_team_id'] = lis[5]
                yield req

        team_url = 'http://zq.win007.com/jsData/teamInfo/teamDetail/tdl{}.js?version={}'
        # Team ids sit at the even indexes of lis_all_team
        for i in range(len(lis_all_team)):
            if i % 2 == 0:
                url = team_url.format(lis_all_team[i], response.meta['re'])
                req = scrapy.Request(url, callback=self.team_data)
                # Add a Referer so the anti-hotlinking check accepts the request
                req.meta['Referer'] = 'http://zq.win007.com/cn/team/Summary/{}.html'.format(lis_all_team[i])
                yield req

    # Player stats for each match: new page layout
    def bs_data_new(self, response):
        # The two tables hold the home side (div[3]) and the away side (div[4])
        member_lis_tr_s = response.xpath('//div[@id="content"]/div[3]/table//tr[position()>2]')
        member_lis_tr_x = response.xpath('//div[@id="content"]/div[4]/table//tr[position()>2]')
        for member_lis in member_lis_tr_s:
            item = Member_Data_New_Item()  # one fresh item per player row
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['l_team_id']
            item['member_id'] = member_lis.xpath('./td[1]/text()').extract_first()
            item['member_name'] = member_lis.xpath('./td[2]/a//text()').extract_first().strip()
            item['position'] = member_lis.xpath('./td[3]/text()').extract_first().strip()
            item['shoot_d'] = member_lis.xpath('./td[4]/text()').extract_first()
            item['shoot_z'] = member_lis.xpath('./td[5]/text()').extract_first()
            item['key_ball'] = member_lis.xpath('./td[6]/text()').extract_first()
            item['guoren'] = member_lis.xpath('./td[7]/text()').extract_first()
            item['chuanq_count'] = member_lis.xpath('./td[8]/text()').extract_first()
            item['chuanq_succ'] = member_lis.xpath('./td[9]/text()').extract_first()
            item['passing'] = member_lis.xpath('./td[10]/text()').extract_first()
            item['hengchuanc'] = member_lis.xpath('./td[11]/text()').extract_first()
            item['success_zd'] = member_lis.xpath('./td[17]/text()').extract_first()
            item['body_jc'] = member_lis.xpath('./td[18]/text()').extract_first()
            item['score'] = member_lis.xpath('./td[30]/text()').extract_first()
            item['key_event'] = member_lis.xpath('./td[31]/a/img/@title').extract_first()
            yield item
        for member_lis in member_lis_tr_x:
            item = Member_Data_New_Item()
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['r_team_id']
            item['member_id'] = member_lis.xpath('./td[1]/text()').extract_first()
            item['member_name'] = member_lis.xpath('./td[2]/a/text()').extract_first().strip()
            item['position'] = member_lis.xpath('./td[3]/text()').extract_first().strip()
            item['shoot_d'] = member_lis.xpath('./td[4]/text()').extract_first()
            item['shoot_z'] = member_lis.xpath('./td[5]/text()').extract_first()
            item['key_ball'] = member_lis.xpath('./td[6]/text()').extract_first()
            item['guoren'] = member_lis.xpath('./td[7]/text()').extract_first()
            item['chuanq_count'] = member_lis.xpath('./td[8]/text()').extract_first()
            item['chuanq_succ'] = member_lis.xpath('./td[9]/text()').extract_first()
            item['passing'] = member_lis.xpath('./td[10]/text()').extract_first()
            item['hengchuanc'] = member_lis.xpath('./td[11]/text()').extract_first()
            item['success_zd'] = member_lis.xpath('./td[17]/text()').extract_first()
            item['body_jc'] = member_lis.xpath('./td[18]/text()').extract_first()
            item['score'] = member_lis.xpath('./td[30]/text()').extract_first()
            item['key_event'] = member_lis.xpath('./td[31]/a/img/@title').extract_first()
            yield item

    # Player line-ups for each match: old page layout (2013 season)
    def bs_data_old(self, response):
        # Left side: starting line-up and substitutes (lists of strings)
        member_lis_l1 = response.xpath("/html/body/table[1]/tr[1]/td[1]/table/tr[3]/td/a//text()").extract()
        member_lis_l2 = response.xpath("/html/body/table[1]/tr[1]/td[1]/table/tr[5]/td/a/text()").extract()
        # Right side: starting line-up and substitutes
        member_lis_r1 = response.xpath("/html/body/table[1]/tr[1]/td[3]/table/tr[3]/td/a/text()").extract()
        member_lis_r2 = response.xpath("/html/body/table[1]/tr[1]/td[3]/table/tr[5]/td/a/text()").extract()
        # Merge starters and substitutes
        member_lis_l = member_lis_l1 + member_lis_l2
        member_lis_r = member_lis_r1 + member_lis_r2
        # Each entry parses into a (shirt number, player name) pair
        for member in member_lis_l:
            res = member.strip()
            member_list = re.findall(r'(\d+)\s?(.*)', res)[0]  # e.g. ('22', '雅斯科莱宁')
            item = Member_Data_Old_Item()
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['l_team_id']
            item['member_id'] = member_list[0]
            item['member_name'] = member_list[1]
            yield item
        for member in member_lis_r:
            res = member.strip()  # e.g. '1 切赫'
            member_list = re.findall(r'(\d+)\s+(.*)', res)[0]  # e.g. ('17', '奥布莱恩')
            item = Member_Data_Old_Item()
            item['bs_num_id'] = response.meta['bs_num_id']
            item['team_id'] = response.meta['r_team_id']
            item['member_id'] = member_list[0]
            item['member_name'] = member_list[1]
            yield item

    # Team information
    def team_data(self, response):
        # First line of the JS file: var teamDetail = [...]
        teamDetail = re.findall(r'var teamDetail = \[(\d+.*)\]', response.text)
        teamDetail_lis = eval(teamDetail[0])
        # Head coach
        var_coach = re.findall(r"var coach = \[\['\d+','','(.*?)','.*','.*',\d\]\];", response.text)
        item = Team_DataItem()
        item['team_id'] = teamDetail_lis[0]
        item['team_name'] = teamDetail_lis[1]
        item['Eng_name'] = teamDetail_lis[3]
        item['team_city'] = teamDetail_lis[5]
        item['team_home'] = teamDetail_lis[8]
        item['build_team_time'] = teamDetail_lis[12]
        try:
            item['var_coach'] = var_coach[0]
        except IndexError:
            item['var_coach'] = 'NULL'
        # Team characteristics: strengths, weaknesses, playing style
        item['team_youshi'] = str(re.findall(r'\[1,\d,"(.*?)\^', response.text))
        item['team_ruodian'] = str(re.findall(r'\[2,\d,"(.*?)\^', response.text))
        item['team_style'] = str(re.findall(r'\[3,\d,"(.*?)\^', response.text))
        # Aggregate season stats: var countSum = [[...]]
        team_stats_lis = re.findall(r'var countSum = \[\[(\'.*?)\]', response.text)[0]
        s = eval(team_stats_lis)
        winrate = int(s[2]) / (int(s[2]) + int(s[3]) + int(s[4]))
        data = (s[2], s[3], s[4], winrate, s[5], s[6], s[7], s[8], s[9], s[10],
                s[11], s[12], s[13], s[14], s[24])
        # Stored as one formatted string (labels kept in Chinese, as in the original data):
        # overall: wins, draws, losses, win rate, fouls, yellow/red cards, possession,
        # shots (on target), passes (completed), pass accuracy, dribbles, rating
        str_stats = ('全部:胜:%s,平:%s,负:%s,胜率:%.3f,犯规:%s,黄牌:%s,红牌:%s,'
                     '控球率:%s,射门(射正):%s(%s),传球(成功):%s(%s),传球成功率:%s,过人次数:%s,评分:%s')
        item['team_stats'] = str_stats % data
        yield item

    # Map the Asian-handicap number to its stored label (values kept as in the original data)
    def rangqiu(self, num_rang):
        rang_map = {
            '0': '平手', '0.25': '平/半', '0.5': '半球', '0.75': '半/一',
            '1': '一球', '1.25': '一/球半', '1.5': '球半', '1.75': '半/二',
            '2': '二球', '2.25': '二/半',
            '-0.25': '*平/半', '-0.5': '*半球', '-0.75': '*半/一',
            '-1': '*一球', '-1.25': '*一/球半', '-1.5': '*球半',
        }
        return rang_map.get(num_rang, '暂未收录')
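For reference, the crawl can be started from a small script instead of the command line (equivalent to running `scrapy crawl Ec` in the project directory). The file name run.py is just an assumption, not part of the original project.

# run.py -- hypothetical helper in the project root; it invokes the Scrapy
# command line so the 'Ec' spider above runs with the project's settings
# and pipelines.
from scrapy.cmdline import execute

if __name__ == '__main__':
    execute(['scrapy', 'crawl', 'Ec'])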
items page
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class QiutanItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class SaichengItem(scrapy.Item):
    lunci = scrapy.Field()
    bs_num_id = scrapy.Field()
    bs_time = scrapy.Field()
    host_team = scrapy.Field()
    h_team_id = scrapy.Field()
    res_score = scrapy.Field()
    guest_team = scrapy.Field()
    g_team_id = scrapy.Field()
    all_rang = scrapy.Field()
    half_rang = scrapy.Field()
    sizes_balls_a = scrapy.Field()
    sizes_balls_h = scrapy.Field()
    half_score = scrapy.Field()

    def get_insert_data(self):
        insert_sql = 'INSERT INTO all_bs_data values (null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        data = (self['lunci'], self['bs_num_id'], self['bs_time'], self['host_team'], self['h_team_id'],
                self['res_score'], self['guest_team'], self['g_team_id'], self['all_rang'], self['half_rang'],
                self['sizes_balls_a'], self['sizes_balls_h'], self['half_score'])
        return insert_sql, data

# all_bs_data table creation statement
# CREATE TABLE all_bs_data(id INT PRIMARY KEY AUTO_INCREMENT,
#     lunci TINYINT,
#     bs_time VARCHAR(20),
#     host_team VARCHAR(20),
#     h_team_id VARCHAR(6),
#     res_score VARCHAR(10),
#     guest_team VARCHAR(20),
#     g_team_id VARCHAR(6),
#     all_rang VARCHAR(6),
#     half_rang VARCHAR(6),
#     sizes_balls_a VARCHAR(6),
#     sizes_balls_h VARCHAR(6),
#     half_score VARCHAR(6)
# ) DEFAULT CHARSET=utf8mb4;
# ALTER TABLE all_bs_data ADD bs_num_id INT AFTER lunci;


class Team_DataItem(scrapy.Item):
    team_id = scrapy.Field()
    team_name = scrapy.Field()
    Eng_name = scrapy.Field()
    team_city = scrapy.Field()
    team_home = scrapy.Field()
    build_team_time = scrapy.Field()
    var_coach = scrapy.Field()
    team_youshi = scrapy.Field()
    team_style = scrapy.Field()
    team_ruodian = scrapy.Field()
    team_stats = scrapy.Field()

    def get_insert_data(self):
        insert_sql = ('INSERT INTO all_team_data(team_id,team_name,Eng_name,team_city,team_home,'
                      'build_team_time,var_coach,team_youshi,team_style,team_ruodian,team_stats)'
                      'values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)')
        data = (self['team_id'], self['team_name'], self['Eng_name'], self['team_city'], self['team_home'],
                self['build_team_time'], self['var_coach'], self['team_youshi'], self['team_style'],
                self['team_ruodian'], self['team_stats'])
        return insert_sql, data

# CREATE TABLE all_team_data(id INT PRIMARY KEY AUTO_INCREMENT,
#     team_id INT,
#     team_name VARCHAR(20),
#     Eng_name VARCHAR(30),
#     team_city VARCHAR(30),
#     team_home VARCHAR(30),
#     build_team_time VARCHAR(20),
#     var_coach VARCHAR(20),
#     team_youshi VARCHAR(200),
#     team_style VARCHAR(200),
#     team_ruodian VARCHAR(200),
#     team_stats VARCHAR(300)
# ) DEFAULT CHARSET=utf8mb4;


class Member_Data_New_Item(scrapy.Item):
    bs_num_id = scrapy.Field()
    team_id = scrapy.Field()
    member_id = scrapy.Field()
    member_name = scrapy.Field()
    position = scrapy.Field()
    shoot_d = scrapy.Field()
    shoot_z = scrapy.Field()
    key_ball = scrapy.Field()
    guoren = scrapy.Field()
    chuanq_count = scrapy.Field()
    chuanq_succ = scrapy.Field()
    passing = scrapy.Field()
    hengchuanc = scrapy.Field()
    success_zd = scrapy.Field()
    body_jc = scrapy.Field()
    score = scrapy.Field()
    key_event = scrapy.Field()

    def get_insert_data(self):
        insert_sql = 'INSERT INTO all_member_data values (null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)'
        data = (self['bs_num_id'], self['team_id'], self['member_id'], self['member_name'], self['position'],
                self['shoot_d'], self['shoot_z'], self['key_ball'], self['guoren'], self['chuanq_count'],
                self['chuanq_succ'], self['passing'], self['hengchuanc'], self['success_zd'], self['body_jc'],
                self['score'], self['key_event'])
        return insert_sql, data

# CREATE TABLE all_member_data(id INT PRIMARY KEY AUTO_INCREMENT,
#     bs_num_id INT,
#     team_id INT,
#     member_id INT,
#     member_name VARCHAR(30),
#     position VARCHAR(10),
#     shoot_d INT,
#     shoot_z INT,
#     key_ball INT,
#     guoren INT,
#     chuanq_count INT,
#     chuanq_succ INT,
#     passing VARCHAR(200),
#     hengchuanc INT,
#     success_zd INT,
#     body_jc INT,
#     score FLOAT,
#     key_event VARCHAR(20)
# ) DEFAULT CHARSET=utf8mb4;


class Member_Data_Old_Item(scrapy.Item):
    bs_num_id = scrapy.Field()
    team_id = scrapy.Field()
    member_id = scrapy.Field()
    member_name = scrapy.Field()

    def get_insert_data(self):
        insert_sql = 'INSERT INTO all_member_data(bs_num_id,team_id,member_id,member_name) values (%s,%s,%s,%s)'
        data = (self['bs_num_id'], self['team_id'], self['member_id'], self['member_name'])
        return insert_sql, data
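Once these tables are populated, requirement 3 from the overview (per-player stats tied to a match and a team) can be read back with a join. The sketch below is my addition and assumes the tables were created with the CREATE TABLE statements above and the same local credentials the pipeline (shown later) uses.

# Sketch: join player rows with their match and team rows.
import pymysql

conn = pymysql.connect(host='localhost', user='root', password='123456',
                       database='second', port=3306,
                       cursorclass=pymysql.cursors.DictCursor)
with conn.cursor() as cur:
    cur.execute(
        'SELECT m.member_name, m.score, t.team_name, b.host_team, b.guest_team, b.res_score '
        'FROM all_member_data m '
        'JOIN all_bs_data b ON b.bs_num_id = m.bs_num_id '
        'JOIN all_team_data t ON t.team_id = m.team_id '
        'LIMIT 10')
    for row in cur.fetchall():
        print(row)
conn.close()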
settings page
# -*- coding: utf-8 -*-

# Scrapy settings for qiutan project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'qiutan'

SPIDER_MODULES = ['qiutan.spiders']
NEWSPIDER_MODULE = 'qiutan.spiders'

# Log level
LOG_LEVEL = 'WARNING'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
# USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
# DOWNLOAD_DELAY = 1
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
DEFAULT_REQUEST_HEADERS = {
    # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    # 'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36',
    'Referer': 'http://zq.win007.com/cn/TeamHeadPage/2013-2014/36.html',
}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'qiutan.middlewares.QiutanSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'qiutan.middlewares.QiutanDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'qiutan.pipelines.QiutanPipeline': 300,
    'qiutan.pipelines.MySql_data_Pipeline': 300,
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
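The delay and throttle settings are left commented out in the original settings.py. If the site starts rejecting requests, it may be worth enabling them; the values below are suggestions of mine, not part of the original project.

# Suggested politeness settings (my addition): slow the crawl down so the
# target site is less likely to block the spider.
DOWNLOAD_DELAY = 1
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5
AUTOTHROTTLE_MAX_DELAY = 60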
pipelines page
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

from qiutan.db_sql import MySql


class QiutanPipeline(object):
    def process_item(self, item, spider):
        return item


class MySql_data_Pipeline(object):
    def __init__(self):
        self.db = MySql('localhost', 'root', '123456', 'second', 3306)

    def process_item(self, item, spider):
        # Every item class that defines get_insert_data() is written to MySQL
        if hasattr(item, 'get_insert_data'):
            insert_sql, data = item.get_insert_data()
            self.db.update(insert_sql, data)
        return item
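The pipeline above opens a database connection in __init__ but never closes it. The variant below is a sketch of mine (the class name MySqlClosingPipeline is hypothetical) showing how Scrapy's close_spider hook could clean up, given that the MySql wrapper in the next section exposes its cursor as .cursor and its pymysql connection as .db.

# Sketch (not in the original post): same pipeline, plus connection cleanup.
from qiutan.db_sql import MySql


class MySqlClosingPipeline(object):
    def __init__(self):
        self.db = MySql('localhost', 'root', '123456', 'second', 3306)

    def process_item(self, item, spider):
        if hasattr(item, 'get_insert_data'):
            insert_sql, data = item.get_insert_data()
            self.db.update(insert_sql, data)
        return item

    def close_spider(self, spider):
        # Scrapy calls this once when the spider finishes.
        self.db.cursor.close()
        self.db.db.close()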
database module page
import pymysql


class MySql:
    def __init__(self, host, user, password, database, port):
        self.db = pymysql.connect(host=host, user=user, password=password, database=database,
                                  port=port, cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.db.cursor()

    def update(self, sql, data):
        try:
            self.cursor.execute(sql, data)
            self.db.commit()
        except Exception:
            self.db.rollback()
            print('Data write failed, please check the SQL statement~')
            print(sql, data)

    def query(self, sql, data):
        try:
            # cursor.execute() only returns the affected row count,
            # so fetch the result rows before returning
            self.cursor.execute(sql, data)
            return self.cursor.fetchall()
        except Exception:
            print('Query failed, please check the SQL statement~')


if __name__ == '__main__':
    db = MySql('localhost', 'root', '123456', 'second', 3306)
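A quick usage sketch for the wrapper (my addition): it assumes the local `second` database from the pipeline configuration exists and has already been filled by a crawl, and relies on query() returning the fetched rows as shown above.

# Sketch: reading crawled matches back through the MySql wrapper.
from qiutan.db_sql import MySql

db = MySql('localhost', 'root', '123456', 'second', 3306)
rows = db.query('SELECT * FROM all_bs_data WHERE h_team_id = %s LIMIT %s', ('25', 5))
for row in rows:
    print(row['bs_time'], row['host_team'], row['res_score'], row['guest_team'])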