Preface
This post is a small treat for friends who enjoy the Shuangseqiu (double-color-ball) lottery: it scrapes the data of every past draw and inserts it into a database, so the data can be modeled and analyzed. Good luck to everyone, and if you win the first prize, don't forget to tip the blogger.
I. What is BeautifulSoup?
BeautifulSoup is the Python library this crawler depends on for parsing HTML; install the beautifulsoup4 package with pip:
pip install beautifulsoup4
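As a quick check that the install worked, a minimal sketch like the following parses a made-up HTML fragment (invented purely for illustration) with the same kind of .select() calls the crawler uses later:
from bs4 import BeautifulSoup

# Made-up fragment, only to confirm that beautifulsoup4 imports and parses correctly
html = '<div class="kj_tablelist02"><li>01</li><li>12</li></div>'
soup = BeautifulSoup(html, 'html.parser')
# .select() takes CSS selectors, the same way the crawler below uses them
print([li.text for li in soup.select('.kj_tablelist02 li')])  # ['01', '12']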
II. Usage steps
1. Database script
The script is as follows (example):
-- ----------------------------
-- Table structure for tb_dou_color_data
-- ----------------------------
DROP TABLE IF EXISTS `tb_dou_color_data`;
CREATE TABLE `tb_dou_color_data` (
`id` int NOT NULL AUTO_INCREMENT COMMENT '主键',
`period` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '期号',
`time` varchar(100) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '日期',
`red_one` int DEFAULT NULL COMMENT '红1',
`red_two` int DEFAULT NULL COMMENT '红2',
`red_three` int DEFAULT NULL COMMENT '红3',
`red_four` int DEFAULT NULL COMMENT '红4',
`red_five` int DEFAULT NULL COMMENT '红5',
`red_six` int DEFAULT NULL COMMENT '红6',
`blue` int DEFAULT NULL COMMENT '蓝1',
`signal_list` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '个数集',
`signal_list_count` int DEFAULT NULL COMMENT '个数集个数',
`red_sum` int DEFAULT NULL COMMENT '红合数',
`signal_model` varchar(255) CHARACTER SET utf8mb4 COLLATE utf8mb4_0900_ai_ci DEFAULT NULL COMMENT '红数字模型',
PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=3463 DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci ROW_FORMAT=DYNAMIC COMMENT='双色球数据';
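After running the script in a MySQL client, you can confirm the table exists from Python with pymysql. This is only a sketch, assuming a local MySQL server, a database named fuli created beforehand, and your own credentials:
import pymysql

# Assumes a local MySQL server and an existing database named fuli; use your own password
db = pymysql.connect(host='127.0.0.1', user='root', password='your_password', database='fuli')
with db.cursor() as cursor:
    cursor.execute('DESCRIBE tb_dou_color_data')
    for column in cursor.fetchall():
        print(column)  # one tuple per column: (name, type, null, key, default, extra)
db.close()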
2. Target site URL
The address is as follows (example):
https://kaijiang.500.com/static/info/kaijiang/shtml/ssq/
This is the site whose pages the crawler requests for the draw data; the script in the next step builds one URL per individual draw, as shown in the example below.
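For reference, this is how the script builds the URL of a single draw page from a two-digit year and a three-digit issue index (the year and issue values here are just an example):
# Per-draw URL pattern used by the crawler: two-digit year + three-digit issue index
year, issue = 20, 1
url = 'http://kaijiang.500.com/shtml/ssq/' + "%02d" % year + '{}.shtml'.format("%03d" % issue)
print(url)  # http://kaijiang.500.com/shtml/ssq/20001.shtml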
3. Python script
import requests
import xlwt
import time
import base64
from bs4 import BeautifulSoup
import pymysql
# Fetch the page content for one draw
def get_html(url):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    try:
        response = requests.get(url, headers=headers)
        response.encoding = "gb2312"
        if response.status_code == 200:
            # Check whether this issue number is beyond the latest published draw:
            # if the result list is empty, the page carries no data
            resoptext = BeautifulSoup(response.text, 'html.parser')
            isFlag = resoptext.select('.kj_tablelist02')[0].select('tr')[1].select('li')[0].text
            if isFlag == '':
                return None
            return response.text
        else:
            print('--------------------')
            print(url)
            print('No data!')
            print('--------------------')
            return None
    except requests.exceptions.RequestException:
        print('--------------------')
        print(url)
        print('Request failed or timed out')
        print('--------------------')
# Parse the page content and yield one record per draw
def parse_html(html):
    soup = BeautifulSoup(html, 'html.parser')
    # The draw details live in the first table with class kj_tablelist02:
    # row 0 holds the issue number and date spans, row 1 holds the seven balls as <li> items
    trhtml = soup.select('.kj_tablelist02')[0].select('tr')
    print(trhtml)
    print("--------------")
    print(trhtml[0].select('span')[0].text)
    print(trhtml[0].select('span')[1].text)
    print(trhtml[1].select('li')[0].text)
    yield {
        'time': trhtml[0].select('span')[1].text[5:15],
        'issue': trhtml[0].select('span')[0].text[6:12],
        'one': trhtml[1].select('li')[0].text,
        'two': trhtml[1].select('li')[1].text,
        'three': trhtml[1].select('li')[2].text,
        'four': trhtml[1].select('li')[3].text,
        'five': trhtml[1].select('li')[4].text,
        'six': trhtml[1].select('li')[5].text,
        'blue': trhtml[1].select('li')[6].text,
    }
# Write the data to an Excel workbook, one sheet per year
def write_to_excel():
    f = xlwt.Workbook(encoding='gb2312')
    # Sheets for the years 2003 through 2020
    for kk in range(3, 21):
        yearSheet(f, kk)
    f.save('fuli.xls')

#conn = pymysql.connect(host="127.0.0.1",user="root",passwd="changkun123",db="fuli")
#sql ="show tables"
#print(str(conn.query(sql)))
def insert_sql():
    # Insert the scraped rows into MySQL
    db = pymysql.connect(host='localhost', user='root', password='gaoshun123', database='fuli')
    # Insert statement; time is listed before period so the column order matches
    # the tuples built in readHtmlFrom()
    query = """insert into tb_dou_color_data (time,period,red_one,red_two,red_three,red_four,red_five,red_six,blue,signal_list,signal_list_count,red_sum,signal_model) values (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"""
    cursor = db.cursor()
    # Each row is a tuple; the numeric fields are cast to int when the rows are built,
    # otherwise the insert fails. There may well be a cleaner way to do this.
    data = readHtmlFrom()
    print(data)
    print(query)
    cursor.executemany(query, data)
    # Commit and close the connection
    cursor.close()
    db.commit()
    db.close()
def readHtmlFrom():
    arrayListCollection = list()
    # Draw data exists from 2003 through 2020; only 2020 is fetched here, widen the range as needed
    for yearStr in range(20, 21):
        for k in range(1, 160):
            # Three-digit issue index within the year, e.g. 001
            ff = "%03d" % k
            print(ff)
            # Two-digit year, e.g. 20
            yearUrl = "%02d" % yearStr
            url = 'http://kaijiang.500.com/shtml/ssq/' + str(yearUrl) + '{}.shtml'.format(ff)
            html = get_html(url)
            # Throttle requests to avoid hammering the site
            time.sleep(0.5)
            if html != None:
                for item in parse_html(html):
                    hashSet = list()
                    hashSet.append(item['time'])
                    hashSet.append(item['issue'])
                    hashSet.append(int(item['one']))
                    hashSet.append(int(item['two']))
                    hashSet.append(int(item['three']))
                    hashSet.append(int(item['four']))
                    hashSet.append(int(item['five']))
                    hashSet.append(int(item['six']))
                    hashSet.append(int(item['blue']))
                    # Set of last digits of the six red numbers (signal_list)
                    hashSetStr = set()
                    print('+++++++++++++++')
                    hashSetStr.add(int(item['one']) % 10)
                    hashSetStr.add(int(item['two']) % 10)
                    hashSetStr.add(int(item['three']) % 10)
                    hashSetStr.add(int(item['four']) % 10)
                    hashSetStr.add(int(item['five']) % 10)
                    hashSetStr.add(int(item['six']) % 10)
                    print(hashSetStr)
                    hashSet.append(str(sorted(hashSetStr)))
                    # Number of distinct last digits (signal_list_count)
                    hashSet.append(int(len(hashSetStr)))
                    # Sum of the six red numbers (red_sum)
                    hashSet.append(int(item['one']) + int(item['two']) + int(item['three']) + int(item['four']) + int(item['five']) + int(item['six']))
                    # Digit model: each red number bucketed by its tens digit (signal_model)
                    arrayList = []
                    arrayList.append(tranfer(int(item['one'])))
                    arrayList.append(tranfer(int(item['two'])))
                    arrayList.append(tranfer(int(item['three'])))
                    arrayList.append(tranfer(int(item['four'])))
                    arrayList.append(tranfer(int(item['five'])))
                    arrayList.append(tranfer(int(item['six'])))
                    hashSet.append(str(arrayList))
                    # Add the finished row to the overall collection
                    element = tuple(hashSet)
                    arrayListCollection.append(element)
    return arrayListCollection
def yearSheet(f, yearStr):
    sheet1 = f.add_sheet(str(yearStr), cell_overwrite_ok=True)
    # Header row; '个数集个数' is included so the headers line up with the columns written below
    row0 = ['日期', '期号', '红1', '红2', '红3', '红4', '红5', '红6', '蓝1', '个数集', '个数集个数', '合数', '数字模型']
    # Write the header row
    for j in range(0, len(row0)):
        sheet1.write(0, j, row0[j])
    # Crawl each draw page in turn and write the result to the sheet
    i = 0
    for k in range(1, 160):
        ff = "%03d" % k
        print(ff)
        yearUrl = "%02d" % yearStr
        url = 'http://kaijiang.500.com/shtml/ssq/' + str(yearUrl) + '{}.shtml'.format(ff)
        #url = 'http://kaijiang.zhcw.com/zhcw/html/3d/list_{}.html'.format(k)
        print(url)
        html = get_html(url)
        time.sleep(0.5)
        print('Saving page {}......'.format(k))
        if html != None:
            # Write one row per draw
            for item in parse_html(html):
                sheet1.write(i + 1, 0, item['time'])
                sheet1.write(i + 1, 1, item['issue'])
                sheet1.write(i + 1, 2, item['one'])
                sheet1.write(i + 1, 3, item['two'])
                sheet1.write(i + 1, 4, item['three'])
                sheet1.write(i + 1, 5, item['four'])
                sheet1.write(i + 1, 6, item['five'])
                sheet1.write(i + 1, 7, item['six'])
                sheet1.write(i + 1, 8, item['blue'])
                # Set of last digits of the six red numbers
                hashSet = set()
                print('+++++++++++++++')
                hashSet.add(int(item['one']) % 10)
                hashSet.add(int(item['two']) % 10)
                hashSet.add(int(item['three']) % 10)
                hashSet.add(int(item['four']) % 10)
                hashSet.add(int(item['five']) % 10)
                hashSet.add(int(item['six']) % 10)
                print(hashSet)
                # Write the set as a string
                sheet1.write(i + 1, 9, str(sorted(hashSet)))
                # Number of distinct last digits
                sheet1.write(i + 1, 10, len(hashSet))
                # Sum of the six red numbers
                sheet1.write(i + 1, 11, int(item['one']) + int(item['two']) + int(item['three']) + int(item['four']) + int(item['five']) + int(item['six']))
                # Digit model
                arrayList = []
                arrayList.append(tranfer(int(item['one'])))
                arrayList.append(tranfer(int(item['two'])))
                arrayList.append(tranfer(int(item['three'])))
                arrayList.append(tranfer(int(item['four'])))
                arrayList.append(tranfer(int(item['five'])))
                arrayList.append(tranfer(int(item['six'])))
                #arrayList.sort()
                sheet1.write(i + 1, 12, str(arrayList))
                i += 1
# Map a red number to its tens-digit bucket, e.g. 7 -> 'X', 13 -> '1X', 25 -> '2X', 31 -> '3X'
def tranfer(num):
    if num < 10:
        return 'X'
    elif 10 <= num < 20:
        return '1X'
    elif 20 <= num < 30:
        return '2X'
    elif num >= 30:
        return '3X'
def main():
    # Export to Excel
    # write_to_excel()
    # Insert into MySQL
    insert_sql()

if __name__ == '__main__':
    main()
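To run it, save the script under a name of your choosing (ssq_spider.py is just an example), make sure MySQL is up and the table from step 1 exists, set your own connection credentials in insert_sql(), and start it with python ssq_spider.py; to export to Excel instead, comment out insert_sql() and uncomment write_to_excel() in main(). For reference, a hypothetical draw (invented numbers) illustrates the derived columns that end up in the database:
# Hypothetical draw: reds 3 12 17 22 27 33, blue 9
# signal_list       = sorted set of the reds' last digits -> '[2, 3, 7]'
# signal_list_count = 3 distinct last digits
# red_sum           = 3 + 12 + 17 + 22 + 27 + 33 = 114
# signal_model      = tens-digit buckets -> "['X', '1X', '1X', '2X', '2X', '3X']"
row = ('2020-01-01', '20001', 3, 12, 17, 22, 27, 33, 9, '[2, 3, 7]', 3, 114, "['X', '1X', '1X', '2X', '2X', '3X']")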