获取mysql增量数据库_Spider -- MySQL数据库之增量爬取

最新推荐文章于 2023-10-13 16:16:18 发布

weixin_39644952

最新推荐文章于 2023-10-13 16:16:18 发布

阅读量228

点赞数

文章标签：获取mysql增量数据库

版权声明：本文为博主原创文章，遵循 CC 4.0 BY-SA 版权协议，转载请附上原文出处链接和本声明。

本文链接：https://blog.csdn.net/weixin_39644952/article/details/113192071

版权

import random
import re
import time
from hashlib import md5
from urllib import request

import pymysql

from useragents import ua_list  # author's own module; supplies User-Agent strings to pick from at random

class FilmSkySpider(object):
    """Incremental crawler for the dytt8.net movie list pages.

    Every detail-page URL is reduced to an MD5 fingerprint.  A fingerprint is
    looked up in the ``request_finger`` table before the page is fetched and
    inserted after the page has been processed, so a later run skips the
    links that were already crawled.

    ``parse_page`` calls ``self.parse_two_page``; that method is expected to
    be provided by the rest of the class.
    """

    def __init__(self):
        # First-level (list) page URL template.
        self.url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
        # PyMySQL 1.0+ only accepts keyword arguments for connect().
        self.db = pymysql.connect(
            host='localhost',
            user='root',
            password='123456',
            database='filmskydb',
            charset='utf8',
        )
        self.cursor = self.db.cursor()

    def get_html(self, url):
        """Fetch ``url`` with a randomly chosen User-Agent and return its text.

        Args:
            url: Address of the page to download.

        Returns:
            The response body decoded as gb2312.
        """
        headers = {'User-Agent': random.choice(ua_list)}
        req = request.Request(url=url, headers=headers)
        # The context manager closes the HTTP response even if reading fails.
        with request.urlopen(req) as res:
            # The site declares charset gb2312 in its page source; bytes that
            # cannot be decoded are dropped instead of raising.
            return res.read().decode('gb2312', 'ignore')

    def re_func(self, re_bds, html):
        """Return every match of the regular expression ``re_bds`` in ``html``.

        Args:
            re_bds: Regular expression source text.
            html: Text to search.

        Returns:
            The list produced by ``findall``; ``.`` also matches newlines
            because the pattern is compiled with ``re.S``.
        """
        pattern = re.compile(re_bds, re.S)
        return pattern.findall(html)

    def parse_page(self, one_url):
        """Crawl one list page and process each not-yet-seen detail link.

        Args:
            one_url: URL of a first-level (list) page.
        """
        html = self.get_html(one_url)
        # NOTE(review): the original pattern was lost when the page markup
        # was stripped from this listing; this is a reconstruction and must
        # be verified against the live list page before use.
        re_bds = (
            r'<table width="100%".*?<td width="5%".*?'
            r'<a href="(.*?)".*?ulink">.*?</table>'
        )
        # one_page_list: ['/html/xxx', '/html/xxx', '/html/xxx']
        one_page_list = self.re_func(re_bds, html)
        for href in one_page_list:
            two_url = 'https://www.dytt8.net' + href
            # Fingerprint of the detail-page URL (MD5 hex digest).
            two_url_md5 = md5(two_url.encode()).hexdigest()
            # Skip links whose fingerprint is already stored.
            if not self.is_go_on(two_url_md5):
                continue
            self.parse_two_page(two_url)
            # Record the fingerprint only after the page has been processed.
            ins = 'insert into request_finger values(%s)'
            self.cursor.execute(ins, [two_url_md5])
            self.db.commit()
            # Pause a random 1-3 seconds after each crawled movie page.
            time.sleep(random.uniform(1, 3))

    def is_go_on(self, two_url_md5):
        """Tell whether the link with this fingerprint still has to be crawled.

        Args:
            two_url_md5: MD5 hex digest of a detail-page URL.

        Returns:
            True when the lookup in ``request_finger`` yields a falsy result
            (no stored fingerprint), False otherwise.
        """
        sel = 'select finger from request_finger where finger=%s'
        result = self.cursor.execute(sel, [two_url_md5])
        # Explicit bool instead of the original's implicit None on the
        # "already crawled" path.
        return not result

#解析二级页面数据

defparse_two_page(self, two_url):

item={}

html=self.get_html(two_url)

re_bds= r'

(.*?)

.*?

weixin_39644952

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
获取mysql增量数据库_Spider -- MySQL数据库之增量爬取

from urllib importrequestimportrefrom useragents importua_list　　# 自己个人写的模块，提供随机User-Agentimporttimeimportrandomimportpymysqlfrom hashlib importmd5classFilmSkySpider(object):def __init__(self):#一级页面u...
复制链接

扫一扫

评论

被折叠的条评论为什么被折叠?

到【灌水乐园】发言

查看更多评论

添加红包

成就一亿技术人!

hope_wisdom

发出的红包

实付元

使用余额支付

点击重新获取

扫码支付

钱包余额 0

抵扣说明：

1.余额是钱包充值的虚拟货币，按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载，可以购买VIP、付费专栏及课程。