#!/usr/bin/env python
# -*- coding: utf-8 -*-
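"""Crawl Shuangseqiu (双色球) lottery results from kaijiang.500.com and save them to a CSV file.

For each issue the script fetches a page such as
http://kaijiang.500.com/shtml/ssq/19056.shtml, extracts the six red balls and
the blue ball, and writes them to crawler.csv.
"""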
from urllib import request
import os
import sys
import random
import gzip
from io import BytesIO
import chardet
import re
import csv
# Target page format: http://kaijiang.500.com/shtml/ssq/19056.shtml
class Crawler:
    def __init__(self, url, path=os.getcwd(), pages=1,
                 headers=None, protocol='http', iplist=None):
        self.url = url
        self.path = path  # directory the crawled data is saved to; defaults to the current directory
        self.pages = pages  # number of pages (draws) to crawl
        self.headers = headers  # request headers, as a dict
        self.protocol = protocol
        self.iplist = iplist  # list of proxy 'IP:port' strings, e.g. ['192.168.0.1:80', '192.168.0.1:88']
# def set_rule(self):
# page_url = self.get_page(url=self.url)
# url_rule = self.url + page_url + '.shtml'
# return url_rule
def set_agent(self):
        # Build a ProxyHandler for the requested protocol and return an opener that uses it
        if self.protocol.lower() == 'http':
            proxy_http = request.ProxyHandler({'http': random.choice(self.iplist)})
            # opener that routes http requests through the proxy
            opener = request.build_opener(proxy_http)
        elif self.protocol.lower() == 'https':
            proxy_https = request.ProxyHandler({'https': random.choice(self.iplist)})
            # opener that routes https requests through the proxy
            opener = request.build_opener(proxy_https)
        else:
            print('Unsupported protocol, please use http or https')
            sys.exit()
        # Add the headers so the request looks like it comes from a browser;
        # addheaders expects a list of (name, value) tuples, not a dict
        opener.addheaders = list(self.headers.items())
        print('opener:', opener)
        return opener
def url_open(self):
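        """
        Fetch self.url and return the decoded HTML.

        When self.iplist is given, requests go through a randomly chosen proxy
        via the opener built in set_agent(); the response body is gunzipped if
        the server returned gzip-compressed content, then decoded to str.
        """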
        # Decide whether to use a proxy: skip it when iplist is empty, otherwise install the proxy opener
        if self.iplist is None:
            print('No proxy configured')
        else:
            opener = self.set_agent()
            # Install the opener globally so urlopen() below routes through the proxy
            request.install_opener(opener)
        req = request.Request(url=self.url, headers=self.headers)
        response = request.urlopen(req)
        html = response.read()
        # Decompress only when the server actually returned gzip-encoded content
        if response.info().get('Content-Encoding') == 'gzip':
            html = gzip.GzipFile(fileobj=BytesIO(html)).read()
        # Decode the page: try UTF-8 first, otherwise fall back to the encoding chardet detects
        try:
            html = html.decode('utf-8')  # html is a bytes object here
        except UnicodeDecodeError as reason:
            print(reason)
            encoding = chardet.detect(html)['encoding'] or 'gbk'
            html = html.decode(encoding, errors='replace')
        return html
def get_page(self):
"""
:param pages: 要爬取多少页
:return: 每页的页码组成的list
<a href="javascript:void(0)" class="iSelect" id="change_date">19056</a>
需要爬取的页码为19056
Python find() 方法检测字符串中是否包含子字符串 str ,如果指定 beg(开始) 和 end(结束) 范围,
则检查是否包含在指定范围内,如果包含子字符串返回开始的索引值,否则返回-1。
"""
# a = html.find(b'change_date') + 13
# b = html.find(b'</a', a)
# 将bytes类型转换成str,并返回
# print(str(html[a:b], encoding='utf-8'))
# return str(html[a:b], encoding='utf-8')
# 获取页码
html = self.url_open()
pattern_page_0 = re.compile(r'change_date">\d{5}</a>')
pattern_page_1 = re.compile(r'\d{5}')
page_num = pattern_page_1.findall(''.join(pattern_page_0.findall(html)))
return int(''.join(page_num))
def find_target(self):
        # Fetch the draw page and pick out the winning numbers
html = self.url_open()
"""
        The HTML to match looks like this:
<li class="ball_red">13</li>
<li class="ball_red">14</li>
<li class="ball_red">17</li>
<li class="ball_red">19</li>
<li class="ball_red">21</li>
<li class="ball_red">29</li>
<li class="ball_blue">01</li>
"""
        # Find the six red balls
pattern_red_0 = re.compile(r'<li class="ball_red">\d{2}</li>')
pattern_red_1 = re.compile(r'\d{2}')
ball_red = pattern_red_1.findall(''.join(pattern_red_0.findall(html)))
        # Find the blue ball
pattern_blue_0 = re.compile(r'<li class="ball_blue">\d{2}</li>')
pattern_blue_1 = re.compile(r'\d{2}')
ball_blue = pattern_blue_1.findall(''.join(pattern_blue_0.findall(html)))
        # The full Shuangseqiu result: six red balls followed by the blue ball
ball = ball_red + ball_blue
return ball
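    # For the draw shown in the docstring above, find_target() returns
    # ['13', '14', '17', '19', '21', '29', '01'] (red balls first, blue ball last).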
def save_target(self, data):
"""
保存爬取目标到csv
:param path: 文件路径
:param data: 要保存的数据
:return: 返回一个文件对象
"""
with open(self.path + '\\crawler.csv', 'w') as cf:
# 初始化写入对象
writer = csv.writer(cf, newline='') # newline参数控制每一行写入方式,默认隔行写入
for p in range(self.pages): # 将每页数据写入csv文件
writer.writerow(['期数', '第一个红球', '第二个红球', '第三个红球',
'第四个红球', '第五个红球', '第六个红球', '绿球'])
writer.writerow(data)
return cf
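# The proxy branch is optional. With working proxies a call would look like this
# (the addresses below are placeholders, not real proxies):
#     Crawler(url=url, headers=headers_init, protocol='http',
#             iplist=['111.111.111.111:80', '111.111.111.112:8080'])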
if __name__ == '__main__':
    url = 'http://kaijiang.500.com/shtml/ssq/'
    headers_init = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64)'
                                  ' AppleWebKit/537.36 (KHTML, like Gecko)'
                                  ' Chrome/55.0.2883.87 UBrowser/6.2.4098.3 Safari/537.36'}
    pages = int(input('How many pages (draws) should be crawled: '))
    crawler_init = Crawler(url=url, headers=headers_init, pages=pages)
    # Start from a fresh csv so results from a previous run are not mixed in
    old_csv = os.path.join(crawler_init.path, 'crawler.csv')
    if os.path.exists(old_csv):
        os.remove(old_csv)
    page_num = crawler_init.get_page()
    for i in range(crawler_init.pages):
        url = 'http://kaijiang.500.com/shtml/ssq/' + str(page_num - i) + '.shtml'
        crawler_new = Crawler(url=url, headers=headers_init)
        ball = crawler_new.find_target()
        ball.insert(0, page_num - i)  # prepend the issue number, then save the whole row to the csv
        crawler_new.save_target(data=ball)
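# Illustrative output: for the draw shown in the find_target docstring, crawler.csv
# would contain a row like  19056,13,14,17,19,21,29,01  after the header line.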