python爬取“大乐透”往期数据
网页分析
构造请求头,获取soup
#只是获取单页的内容
import requests
import re
import pandas
from bs4 import BeautifulSoup
# Request headers sent with every page fetch: a desktop browser user agent
# and an explicit Host for the lottery site.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3732.400 QQBrowser/10.5.3819.400'
    ),
    'Host': 'www.caibow.com',
}
# Download the first results page (10 second timeout) and parse its HTML
# with the built-in 'html.parser' backend.
url = 'https://www.caibow.com/dlt/kj/p1/'
res = requests.get(
    url,
    headers=headers,
    timeout=10,
)
soup = BeautifulSoup(res.text, 'html.parser')
获取期数标题
# Print the issue number of every draw entry on the page.
for entry in soup.select('.pd_10_20.bb_das '):
    for title in entry.select('.fl.fz_16'):
        digits = re.findall(r"\d+", title.text)
        # Only the first run of digits in the title is the issue number.
        if digits:
            print(digits[0])
    # NOTE(review): the original indentation was lost; the separator is
    # assumed to be printed once per draw entry.
    print('----------------------------------------')
结果:
取日期
# Print the draw date (YYYY-M-D) of every draw entry on the page.
# Raw string: '\d' in a plain literal is an invalid escape sequence and
# triggers a warning on current Python versions.
date_re = re.compile(r'.*?(\d{4}-\d{1,2}-\d{1,2}).*?')
for spans in soup.select('.pd_10_20.bb_das '):
    for dateNum2 in spans.select('.fr.fz_14'):
        # The tag is rendered back to HTML text before matching.
        dates = date_re.findall(str(dateNum2))
        if dates:
            print(dates[0])
    # NOTE(review): the original indentation was lost; the separator is
    # assumed to be printed once per draw entry.
    print('----------------------------------------')
取奖池滚存
# Print the rolled-over jackpot of every draw entry on the page.
# The page does not use one unit/format for the amount, so several patterns
# are tried in order: four comma-separated groups, a decimal amount, then
# three comma-separated groups (the same set get_one_data uses).
# Raw strings keep '\d' and '\,' from being read as invalid string escapes.
mnY_re = re.compile(r'.*?(\d+\.\d+\w+).*?')
mn_re = re.compile(r'.*?(\d+\,\d+\,\d+\,\d+\w+).*?')
mn_re2 = re.compile(r'.*?(\d+\,\d+\,\d+\w+).*?')
for spans in soup.select('.pd_10_20.bb_das '):
    for money in spans.select('.fr.fz_14.lh_30'):
        money_html = str(money)
        mone1 = mn_re.findall(money_html)
        mone2 = mnY_re.findall(money_html)
        mone3 = mn_re2.findall(money_html)
        if mone1:
            print(mone1[0])
        elif mone2:
            print(mone2[0])
        elif mone3:
            print(mone3[0])
        else:
            # No recognised amount format in this element.
            print('null')
    # NOTE(review): the original indentation was lost; the separator is
    # assumed to be printed once per draw entry.
    print('----------------------------------------')
取红蓝球号
# Collect the red and blue ball numbers of every draw entry as a list of
# dicts, which is convenient for building a table later.
d = []
# Raw strings keep '\d' from being read as an invalid string escape.
pattern_red = re.compile(r'.*?<span class="fl all_ball red_ball color_white mr10">(\d+)</span>.*?')
pattern_blue = re.compile(r'.*?<span class="fl all_ball blue_ball color_white mr10">(\d+)</span>.*?')
for spans in soup.select('.pd_10_20.bb_das '):
    soup_list_red = spans.find_all('span', class_='fl all_ball red_ball color_white mr10')
    soup_list_blue = spans.find_all('span', class_='fl all_ball blue_ball color_white mr10')
    # The matched tags are rendered back to HTML text and the numbers are
    # pulled out with the patterns above.
    items = pattern_red.findall(str(soup_list_red))
    items_blue = pattern_blue.findall(str(soup_list_blue))
    print(items)
    print(items_blue)
    red_ball = ','.join(items)
    blue_ball = ','.join(items_blue)
    print(red_ball)
    print(blue_ball)
    info = {'红球': red_ball, '蓝球': blue_ball}
    d.append(info)
    # NOTE(review): the original indentation was lost; the separator is
    # assumed to be printed once per draw entry.
    print('----------------------------------------')
完整代码
#coding:utf-8
'''
Created on 2019年12月21日
@author: liu yan
'''
import requests
import re
import pandas
from bs4 import BeautifulSoup
# Request headers sent with every page fetch: a desktop browser user agent
# and an explicit Host for the lottery site.
headers = {
    'user-agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/70.0.3538.25 Safari/537.36 '
        'Core/1.70.3732.400 QQBrowser/10.5.3819.400'
    ),
    'Host': 'www.caibow.com',
}
#url = 'https://www.caibow.com/dlt/kj/p1/'
#res = requests.get(url,headers=headers,timeout = 10)
#soup = BeautifulSoup(res.text,'html.parser')
def getsoup(url):
    """Fetch *url* and return its body parsed by BeautifulSoup.

    The request is sent with the module-level ``headers`` and a 10 second
    timeout; the response text is parsed with the ``html.parser`` backend.
    """
    response = requests.get(url, timeout=10, headers=headers)
    return BeautifulSoup(response.text, 'html.parser')
# Patterns are compiled once at import time instead of on every call.
# Raw strings keep '\d' and '\,' from being read as invalid string escapes.
_ISSUE_RE = re.compile(r"\d+")
_DATE_RE = re.compile(r'.*?(\d{4}-\d{1,2}-\d{1,2}).*?')
# Jackpot formats, tried in this order; the first pattern that matches wins.
_JACKPOT_RES = (
    re.compile(r'.*?(\d+\,\d+\,\d+\,\d+\w+).*?'),  # four comma-separated groups
    re.compile(r'.*?(\d+\.\d+\w+).*?'),            # decimal amount plus unit
    re.compile(r'.*?(\d+\,\d+\,\d+\w+).*?'),       # three comma-separated groups
)
_BALL_RES = {
    'red': re.compile(r'.*?<span class="fl all_ball red_ball color_white mr10">(\d+)</span>.*?'),
    'blue': re.compile(r'.*?<span class="fl all_ball blue_ball color_white mr10">(\d+)</span>.*?'),
}


def _jackpot_text(html):
    """Return the first jackpot amount found in *html*, or 'null' if none."""
    for pattern in _JACKPOT_RES:
        match = pattern.search(html)
        if match:
            return match.group(1)
    return 'null'


def _ball_numbers(spans, color):
    """Return the comma-joined numbers of the *color* balls inside *spans*."""
    css_class = 'fl all_ball {color}_ball color_white mr10'.format(color=color)
    tags = spans.find_all('span', class_=css_class)
    # The matched tags are rendered back to HTML text and the numbers are
    # pulled out with the pattern for that color.
    return ','.join(_BALL_RES[color].findall(str(tags)))


def get_one_data(spans):
    """Extract one draw record from the element of a single draw entry.

    Args:
        spans: element for one draw entry; it must provide ``select()`` and
            ``find_all()`` (the caller passes each element matched by
            ``'.pd_10_20.bb_das '``).

    Returns:
        dict with the keys '红球号码' and '蓝球号码' (comma-joined ball
        numbers, possibly empty strings) and, when the corresponding
        element is present and matches, '大乐透期数' (issue number),
        '日期' (draw date) and '奖池滚存(元)' (jackpot text, or 'null'
        when the element holds no recognised amount).
    """
    info = {}

    # Issue number: first run of digits in the title text. If several
    # elements match, the last one with digits wins.
    for title in spans.select('.fl.fz_16'):
        match = _ISSUE_RE.search(title.text)
        if match:
            info['大乐透期数'] = match.group()

    # Draw date: matched against the rendered HTML of each candidate.
    for candidate in spans.select('.fr.fz_14'):
        match = _DATE_RE.search(str(candidate))
        if match:
            info['日期'] = match.group(1)

    # Rolled-over jackpot: the page does not use one format for the amount.
    for money in spans.select('.fr.fz_14.lh_30'):
        info['奖池滚存(元)'] = _jackpot_text(str(money))

    info['红球号码'] = _ball_numbers(spans, 'red')
    info['蓝球号码'] = _ball_numbers(spans, 'blue')
    return info
# dlt collects one dict per draw entry, gathered from result pages 1..129.
dlt = []
for i in range(1, 130):
    url = 'https://www.caibow.com/dlt/kj/p{num}/'.format(num=i)
    soup = getsoup(url)
    for spans in soup.select('.pd_10_20.bb_das '):
        dlt.append(get_one_data(spans))

# Show one sample record; guarded so that fewer than three records cannot
# raise IndexError before anything is written.
if len(dlt) > 2:
    print(dlt[2])
print(len(dlt))

order = ['大乐透期数', '日期', '奖池滚存(元)', '红球号码', '蓝球号码']
df = pandas.DataFrame(dlt)
# reindex rather than df[order]: get_one_data only sets some keys when the
# matching element exists, and a column that no record supplied would make
# df[order] raise KeyError. With reindex it becomes an empty column instead.
df = df.reindex(columns=order)
df.to_excel('Dlt.xlsx')