import random
import re
import time
from hashlib import md5
from urllib import request

import pymysql

# The author's own module; supplies the User-Agent strings chosen at random.
from useragents import ua_list
class FilmSkySpider(object):
    """Crawler for dytt8.net film pages that keeps its state in MySQL."""

    def __init__(self):
        """Store the listing-page URL template and open the MySQL connection.

        Any error raised by ``pymysql.connect`` propagates to the caller.
        """
        # First-level (listing) page URL; the ``{}`` placeholder is presumably
        # filled with the page number by the caller.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # Keyword arguments are required: PyMySQL 1.0+ no longer accepts
        # positional connection parameters.
        # NOTE(review): credentials are hard-coded; consider moving to config.
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='filmskydb',
            charset='utf8',
        )
        self.cursor = self.db.cursor()
#获取html功能函数
def get_html(self, url):
    """Download ``url`` and return the response body as text.

    A User-Agent header chosen at random from ``ua_list`` is sent with the
    request.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body decoded as gb2312 (``str``).

    Errors raised by ``urlopen`` (for example on network failure) propagate.
    """
    headers = {'User-Agent': random.choice(ua_list)}
    req = request.Request(url=url, headers=headers)
    # Use the response as a context manager so the connection is always
    # closed, even if reading fails.
    with request.urlopen(req) as res:
        raw = res.read()
    # The site's page source declares charset='gb2312'. Bytes that cannot be
    # decoded are dropped ('ignore') instead of raising.
    return raw.decode('gb2312', 'ignore')
#正则解析功能函数
def re_func(self, re_bds, html):
    """Return all matches of the regular expression ``re_bds`` in ``html``.

    Args:
        re_bds: Regular-expression pattern string.
        html: Text to search.

    Returns:
        The list produced by ``findall`` (empty when nothing matches).
    """
    # re.S lets '.' match newlines, so one pattern can span several lines
    # of markup.
    pattern = re.compile(re_bds, re.S)
    return pattern.findall(html)
#获取数据函数
defparse_page(self, one_url):
# Crawl one listing page: fetch `one_url` with self.get_html, extract the
# detail-page hrefs with self.re_func, and for each href build the absolute
# URL, compute its md5 fingerprint and ask self.is_go_on whether to crawl it.
# Presumably (indentation is lost) the parse_two_page call, the fingerprint
# insert/commit and the random 1-3 second sleep all sit inside that `if`.
#
# NOTE(review): this block is not valid Python as it stands - keywords are
# fused to identifiers (`defparse_page`, `inone_page_list`, `ifself`), two
# statements share line 13 of the block, and all indentation is missing.
html=self.get_html(one_url)
# NOTE(review): the regex literal below is empty and split across two lines;
# the original pattern appears to have been lost. Per the example that follows
# it, the pattern should capture hrefs such as '/html/xxx'. Restore it from
# the original source before running.
re_bds= r'
'#one_page_list: ['/html/xxx','/html/xxx','/html/xxx']
one_page_list =self.re_func(re_bds, html)
for href inone_page_list:
two_url= 'https://www.dytt8.net' +href# Build a fingerprint of the URL - md5 digest
s =md5()
s.update(two_url.encode())
two_url_md5=s.hexdigest()# Decide whether this link still needs to be crawled
ifself.is_go_on(two_url_md5):
self.parse_two_page(two_url)# After crawling this link, store its fingerprint in the database table
ins = 'insert into request_finger values(%s)'self.cursor.execute(ins, [two_url_md5])
self.db.commit()# uniform: returns a float; sleep after crawling one film's info
time.sleep(random.uniform(1, 3))
def is_go_on(self, two_url_md5):
    """Tell whether a URL fingerprint has not been crawled before.

    The fingerprint is looked up in the ``request_finger`` table before any
    crawling is done.

    Args:
        two_url_md5: md5 hex digest identifying a URL.

    Returns:
        ``True`` when the lookup reports no matching row (go on and crawl),
        ``False`` when the fingerprint is already stored.
    """
    sel = 'select finger from request_finger where finger=%s'
    # execute() returns the number of rows the query matched (PyMySQL
    # behaviour); a falsy result means the fingerprint is new.
    result = self.cursor.execute(sel, [two_url_md5])
    # Always return a bool rather than falling through to None on a hit.
    return not result
#解析二级页面数据
defparse_two_page(self, two_url):
item={}
html=self.get_html(two_url)
re_bds= r'