'''
1、查看所抓数据在响应中是否存在
右键->查看网页源代码->搜索关键字
2、查找并分析url地址规律
第1页:https://www.maoyan.com/board/4?offset=0
第2页:https://www.maoyan.com/board/4?offset=10
第n页:https://www.maoyan.com/board/4?offset=(n-1)*10
3、编写正则表达式
4、定义程序结构,完善程序
'''
#书写正则表达式
'''
<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>
'''
from urllib import request
import time
import random
import re
import pymysql
class MaoyanSpider:
    """Scrape the Maoyan board pages and store the films in MySQL.

    Each board page is downloaded, parsed with a regular expression into
    ``(title, stars, release time)`` tuples, accumulated in memory, and
    finally written to the ``maoyantab`` table with one batch insert.
    """

    # Headers sent with every page request. The key must be spelled
    # 'User-Agent'; with any other spelling urllib adds its own default
    # "Python-urllib/x.y" agent instead of the browser string below.
    DEFAULT_HEADERS = {
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.72 Safari/537.36',
        'Cookie':'__mta=147616392.1637811587010.1637811593052.1637811756578.3; uuid_n_v=v1; uuid=547E75D04DA111ECBB63453B2950931D5FF9E0E1483B4DA6B756E833582EB4C1; _csrf=5c8f2ce22de8202f5532da923e897e8482e97f3b030e9a53aac99f4125e10599; _lxsdk_cuid=17d46a5abcec8-03ce73033d88ae-978183a-1fa400-17d46a5abcec8; _lxsdk=547E75D04DA111ECBB63453B2950931D5FF9E0E1483B4DA6B756E833582EB4C1; _lx_utm=utm_source%3DBaidu%26utm_medium%3Dorganic; Hm_lvt_703e94591e87be68cc8da0da7cbd0be2=1637567803,1637575278,1637810301,1637811753; __mta=147616392.1637811587010.1637811593052.1637811755085.3; Hm_lpvt_703e94591e87be68cc8da0da7cbd0be2=1637811757; _lxsdk_s=17d552d7c2e-6ee-c60-2c9%7C%7C13'
    }

    # Parameterized insert statement used by run(); kept at class level so it
    # exists even when no page has been parsed yet.
    ins = 'insert into maoyantab value(%s,%s,%s)'

    # Captures, per film entry: the title attribute, the <p class="star">
    # text and the <p class="releasetime"> text. re.S lets '.' span newlines.
    _FILM_PATTERN = re.compile(
        '<div class="movie-item-info">.*?title="(.*?)".*?<p class="star">(.*?)</p>.*?<p class="releasetime">(.*?)</p>',
        re.S,
    )

    def __init__(self):
        """Initialise crawl state and open the MySQL connection."""
        # URL template; '{}' receives the page offset (0, 10, 20, ...).
        self.url = 'https://maoyan.com/board/4?offset={}'
        # Per-instance copy so header tweaks do not leak into the class default.
        self.headers = dict(self.DEFAULT_HEADERS)
        # Number of films parsed so far.
        self.i = 0
        # NOTE(review): credentials are hard-coded; consider loading them from
        # configuration or the environment.
        self.db = pymysql.connect(host='localhost',user='root',password='123456',database='maoyandb',charset='utf8')
        self.cursor = self.db.cursor()
        # Every parsed film tuple, accumulated across pages for the batch insert.
        self.all_film_li = []

    def get_html(self, url):
        """Download ``url`` and pass the decoded page to :meth:`parse_html`.

        Errors raised by ``urlopen`` (network/HTTP failures) propagate to
        the caller.
        """
        req = request.Request(url=url, headers=self.headers)
        # Context manager guarantees the HTTP response is closed.
        with request.urlopen(req) as res:
            html = res.read().decode()
        self.parse_html(html)

    def parse_html(self, html):
        """Extract the film tuples from ``html`` and pass them to :meth:`save_html`."""
        r_list = self._FILM_PATTERN.findall(html)
        self.save_html(r_list)

    def save_html(self, r_list):
        """Strip each matched field, record the film and print it.

        ``r_list`` is a list of 3-tuples as produced by the film regex. Each
        cleaned tuple is appended to ``self.all_film_li`` and ``self.i`` is
        incremented once per film.
        """
        for r in r_list:
            film_t = (
                r[0].strip(),
                r[1].strip(),
                r[2].strip()
            )
            self.i += 1
            self.all_film_li.append(film_t)
            print(film_t)

    def run(self):
        """Crawl offsets 0..90 in steps of 10, then batch-insert the films.

        The cursor and connection are always closed, even if a page fetch or
        the insert raises; the exception itself still propagates.
        """
        try:
            for offset in range(0, 91, 10):
                self.get_html(self.url.format(offset))
                # Throttle requests: pause 1 or 2 seconds between pages.
                time.sleep(random.randint(1, 2))
            # Nothing to write when no film was matched.
            if self.all_film_li:
                self.cursor.executemany(self.ins, self.all_film_li)
                self.db.commit()
        finally:
            self.cursor.close()
            self.db.close()
# Script entry point: crawl every board page, then report how many films
# were captured.
if __name__ == '__main__':
    spider = MaoyanSpider()
    spider.run()
    print('电影数量 {}'.format(spider.i))
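# NOTE(review): the two values below look like stray page metadata from
# wherever this snippet was copied; as bare lines they are a syntax error
# in Python 3, so they are kept only as comments.
# 09-09
# 551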