Python 爬虫案例(数据2000+)

Python 爬取“大乐透”往期数据

在这里插入图片描述

网页分析

在这里插入图片描述

构造请求头,获取soup

#只是获取单页的内容
import requests
import re
import pandas
from bs4 import BeautifulSoup
# Request headers sent with the page request: a desktop-browser User-Agent
# string and an explicit Host for the target site.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400',
    'Host':'www.caibow.com'
    }
# Only the first results page is fetched in this walkthrough.
url = 'https://www.caibow.com/dlt/kj/p1/'
res = requests.get(url, headers=headers, timeout=10)
# Fail fast on a 4xx/5xx answer; otherwise the error page would be parsed
# below and every later selector would silently match nothing.
res.raise_for_status()
soup = BeautifulSoup(res.text, 'html.parser')

获取期数标题

# Print the issue (draw) number found in each result row.
issue_digits = re.compile(r"\d+")
for row in soup.select('.pd_10_20.bb_das '):
    for title in row.select('.fl.fz_16'):
        # The first run of digits in the title text is the issue number.
        found = issue_digits.search(title.text)
        if found is not None:
            print(found.group())
    # Separator line printed after every row.
    print('----------------------------------------')

结果:
在这里插入图片描述

取日期

# Print the draw date found in each result row.
# The pattern is a raw string: "\d" inside a normal literal is an invalid
# escape sequence and triggers a warning on current Python versions.  The
# surrounding ".*?" padding was dropped because searching already scans the
# whole string.
date_re = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')
for spans in soup.select('.pd_10_20.bb_das '):
    for dateNum2 in spans.select('.fr.fz_14'):
        # Search the element's markup (str() of the tag), not only its text.
        first_date = date_re.search(str(dateNum2))
        if first_date:
            print(first_date.group())
    # Separator line printed after every row.
    print('----------------------------------------')

在这里插入图片描述

取奖池滚存

# Print the jackpot rollover amount found in each result row.
# The page does not use one consistent unit (amounts in yuan vs. in units of
# 100 million yuan), so two patterns are needed:
#   mn_re  - four comma-separated digit groups followed by the unit text
#   mnY_re - a decimal figure followed by the unit text
# Both are raw strings: "\d" inside a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
mnY_re = re.compile(r'\d+\.\d+\w+')
mn_re = re.compile(r'\d+,\d+,\d+,\d+\w+')
for spans in soup.select('.pd_10_20.bb_das '):
    for money in spans.select('.fr.fz_14.lh_30'):
        markup = str(money)
        # The comma-grouped form takes priority; the decimal form is only
        # tried when the first pattern finds nothing.
        amount = mn_re.search(markup) or mnY_re.search(markup)
        if amount:
            print(amount.group())
        else:
            print('null')
    # Separator line printed after every row.
    print('----------------------------------------')

在这里插入图片描述

取红蓝球号

# Collect one dict per result row in a list so the data can be tabulated later.
d = []
# Patterns that pull the number out of a red / blue ball <span>.  They are raw
# strings because "\d" inside a normal literal is an invalid escape sequence.
pattern_red = re.compile(r'<span class="fl all_ball red_ball color_white mr10">(\d+)</span>')
pattern_blue = re.compile(r'<span class="fl all_ball blue_ball color_white mr10">(\d+)</span>')
for spans in soup.select('.pd_10_20.bb_das '):
    # Ball spans are selected by their full class attribute string.
    soup_list_red = spans.find_all('span', class_='fl all_ball {color}_ball color_white mr10'.format(color="red"))
    soup_list_blue = spans.find_all('span', class_='fl all_ball {color}_ball color_white mr10'.format(color="blue"))

    # Each list of tags is stringified once and scanned once (the original
    # repeated both findall calls a second time with identical results).
    items = pattern_red.findall(str(soup_list_red))
    items_blue = pattern_blue.findall(str(soup_list_blue))
    print(items)
    print(items_blue)
    red_ball = ','.join(items)
    blue_ball = ','.join(items_blue)
    print(red_ball)
    print(blue_ball)
    # A fresh dict per row, so rows stored in d never share state.
    info = {'红球': red_ball, '蓝球': blue_ball}
    d.append(info)
    # Separator line printed after every row.
    print('----------------------------------------')

在这里插入图片描述

完整代码

#coding:utf-8
'''
Created on 2019年12月21日
@author: liu yan
'''

import requests
import re
import pandas
from bs4 import BeautifulSoup
# Request headers passed to requests.get() by getsoup(): a desktop-browser
# User-Agent string and an explicit Host for the target site.
headers = {
    'user-agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3732.400 QQBrowser/10.5.3819.400',
    'Host':'www.caibow.com'
    }
#url = 'https://www.caibow.com/dlt/kj/p1/'
#res = requests.get(url,headers=headers,timeout = 10)
#soup = BeautifulSoup(res.text,'html.parser')

def getsoup(url):
    """Download *url* and return its HTML parsed as a BeautifulSoup tree.

    The request is sent with the module-level ``headers`` and a 10-second
    timeout, and the body is parsed with the built-in ``html.parser``.

    Raises:
        requests.RequestException: on connection failure or timeout, and (as
            its subclass ``requests.HTTPError``) when the server answers with
            a 4xx/5xx status.
    """
    res = requests.get(url, headers=headers, timeout=10)
    # Without this check an error page would be parsed and the caller would
    # silently get a tree containing no result rows.
    res.raise_for_status()
    return BeautifulSoup(res.text, 'html.parser')

# Patterns are compiled once at import time instead of on every call.  They are
# raw strings because "\d" inside a normal literal is an invalid escape
# sequence and triggers a warning on current Python versions.
_ISSUE_RE = re.compile(r'\d+')
_DATE_RE = re.compile(r'\d{4}-\d{1,2}-\d{1,2}')
# Jackpot formats, tried in this order: four comma-separated digit groups,
# a decimal figure, then three comma-separated digit groups - each followed
# by its unit text.
_JACKPOT_RES = (
    re.compile(r'\d+,\d+,\d+,\d+\w+'),
    re.compile(r'\d+\.\d+\w+'),
    re.compile(r'\d+,\d+,\d+\w+'),
)
# Full class attribute carried by a ball <span>; {color} is "red" or "blue".
_BALL_CLASS = 'fl all_ball {color}_ball color_white mr10'
_BALL_RES = {
    color: re.compile(
        '<span class="' + _BALL_CLASS.format(color=color) + r'">(\d+)</span>'
    )
    for color in ('red', 'blue')
}


def _ball_numbers(spans, color):
    """Return the comma-joined numbers of the *color* balls inside *spans*."""
    balls = spans.find_all('span', class_=_BALL_CLASS.format(color=color))
    # The numbers are read from the stringified list of matching tags.
    return ','.join(_BALL_RES[color].findall(str(balls)))


def get_one_data(spans):
    """Extract one draw record from a single result-row element.

    Args:
        spans: the element for one result row; it must provide ``select()``
            and ``find_all()`` the way a BeautifulSoup tag does.

    Returns:
        dict: always contains '红球号码' and '蓝球号码' (comma-joined number
        strings, empty when no ball is found).  '大乐透期数' and '日期' are
        present only when a value is found; '奖池滚存(元)' is present only when
        a jackpot element exists, and is 'null' if none of the known amount
        formats matches it.
    """
    info = {}

    # Issue number: first run of digits in the title text.  If several title
    # elements match, the last one wins (same as assigning in a loop).
    for title in spans.select('.fl.fz_16'):
        issue = _ISSUE_RE.search(title.text)
        if issue:
            info['大乐透期数'] = issue.group()

    # Draw date: searched in the element's markup, not only its text.
    for date_tag in spans.select('.fr.fz_14'):
        date = _DATE_RE.search(str(date_tag))
        if date:
            info['日期'] = date.group()

    # Jackpot rollover: take the first format that matches, in priority order;
    # later patterns are not evaluated once one has matched.
    for money in spans.select('.fr.fz_14.lh_30'):
        markup = str(money)
        amount = next(
            (hit for hit in (p.search(markup) for p in _JACKPOT_RES) if hit),
            None,
        )
        info['奖池滚存(元)'] = amount.group() if amount else 'null'

    # Ball numbers.
    info['红球号码'] = _ball_numbers(spans, 'red')
    info['蓝球号码'] = _ball_numbers(spans, 'blue')

    return info

# Crawl every results page and export the collected draws to an Excel file.
# dlt holds one dict per draw, as returned by get_one_data().
dlt = []
for i in range(1, 130):
    url = 'https://www.caibow.com/dlt/kj/p{num}/'.format(num=i)
    try:
        soup = getsoup(url)
    except requests.RequestException as exc:
        # One unreachable page must not throw away the pages already
        # collected; report it and move on to the next page.
        print('skipped {url}: {err}'.format(url=url, err=exc))
        continue
    dlt.extend(get_one_data(spans) for spans in soup.select('.pd_10_20.bb_das '))
# Show the third record as a sample, but only when at least three exist.
if len(dlt) > 2:
    print(dlt[2])
print(len(dlt))
order = ['大乐透期数', '日期', '奖池滚存(元)', '红球号码','蓝球号码']
# columns= fixes the column order and also creates any column that no record
# supplied, where indexing with df[order] would raise KeyError.
df = pandas.DataFrame(dlt, columns=order)
df.to_excel('Dlt.xlsx')

在这里插入图片描述
在这里插入图片描述

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值