re正则实战-豆瓣Top250
一、前言
在上文我们讲解了re正则表达式在python中的各种应用,在本文我们将以下面的例子带着大家去看看在实战中是如何运用的。
目标url:https://movie.douban.com/top250?start=0&filter=
二、数据分析
①:先在源代码中查看是否存在我们需要的数据
从下图可以发现,我们需要的数据就在源代码当中,那我们直接请求该网址的url即可拿到原始数据
②:每一页只有25条记录,如何实现翻页效果呢?
点击下一页,观察url的变化
https://movie.douban.com/top250?start=0&filter=
https://movie.douban.com/top250?start=25&filter=
https://movie.douban.com/top250?start=50&filter=
从上面的url可以发现,每翻一页,查询参数start的值就增加25。因此只需要依次把start设为0、25、50……225,就可以实现翻页效果。
三、数据获取及展示
ps:豆瓣官方为了应对爬虫,当遭受到频繁请求时会要求通过验证码才能继续浏览。本文的解决方案是:人工通过验证码后拿到正确的cookie,再替换cookie继续发送请求拿到数据。
数据获取
①:每一页参数获取的代码
def _first_or_placeholder(items, placeholder='空'):
    """Return the first element of ``items``, or ``placeholder`` if it is empty."""
    return items[0] if items else placeholder


def parse_url(url):
    """Fetch one Douban Top250 page, print every movie on it and save it to a file.

    The page is requested (after a 3 second pause) until the response no
    longer contains the anti-bot verification notice.  Whenever the notice
    is present the user is prompted for a fresh cookie, which is used for
    the next attempt.  Every ``<li>`` under ``<ol class="grid_view">`` is
    parsed for rank, title, director, cast, year and country; the fields
    are printed and appended to ``豆瓣TOP250.txt``.  A field that cannot be
    extracted is reported as ``空`` instead of raising ``IndexError``.

    Args:
        url: Address of the Top250 page to fetch.

    Returns:
        None.
    """
    captcha_notice = '你访问豆瓣的方式有点像机器人程序。为了保护用户的数据,请向我们证明你是人类:'
    # Compiled once per call instead of once per retry.
    director_re = re.compile('导演: (.*?)\xa0', re.S)
    actor_re = re.compile('主演: (.*)', re.S)
    year_re = re.compile('(.*?)\xa0', re.S)
    country_re = re.compile('\xa0/\xa0(.*?)\xa0/\xa0', re.S)
    # Kept outside the retry loop so a cookie typed in after a verification
    # page is the one sent with the next request.
    cookie = 'bid=QWUV40oy-FY; _pk_id.100001.4cf6=f2805921a5ad8609.1685872579.; ll="108309"; _vwo_uuid_v2=D12C7D490102E90C58FD2D45CA5ACB05B|01cb4d51178de9859d5df7992ca64578; douban-fav-remind=1; __utmv=30149280.27358; __utma=223695111.363614075.1685872580.1692524924.1692526991.12; viewed="35264301"; __utma=30149280.2127834450.1685872580.1695305505.1700229256.17; __utmz=30149280.1700229256.17.12.utmcsr=cn.bing.com|utmccn=(referral)|utmcmd=referral|utmcct=/; _pk_ses.100001.4cf6=1; ap_v=0,6.0'

    while True:
        print(f"开始请求{url}")
        time.sleep(3)
        _headers = {
            'Cookie': cookie,
            "User-Agent": random.choice(user_agent_list),
        }
        # A timeout keeps a stalled connection from blocking the crawl forever.
        resp = requests.get(url=url, headers=_headers, timeout=10)
        content = resp.text
        if captcha_notice in content:
            # Retrying with the same cookie would return the same
            # verification page, so ask for a new one first.
            cookie = str(input("更新cookie值: "))
            continue

        html = etree.HTML(content)
        records = []
        for data in html.xpath('//ol[@class="grid_view"]//li'):
            movie_rank = _first_or_placeholder(data.xpath('.//em[@class=""]/text()'))
            movie_name = _first_or_placeholder(data.xpath('.//span[@class="title"]/text()'))
            # data.xpath('.//p[@class=""]/text()') returns two text lines, e.g.
            # ['\n 导演: 马丁·布莱斯 Martin Brest\xa0\xa0\xa0主演: 阿尔·帕西诺 Al Pacino / 克里斯...', '\n 1992\xa0/\xa0美国\xa0/\xa0剧情\n ']
            info_lines = data.xpath('.//p[@class=""]/text()')
            people_line = info_lines[0] if len(info_lines) > 0 else ''
            release_line = info_lines[1] if len(info_lines) > 1 else ''
            movie_director = _first_or_placeholder(director_re.findall(people_line))
            movie_actor = _first_or_placeholder(actor_re.findall(people_line))
            movie_year = _first_or_placeholder(year_re.findall(release_line)).strip('\n ')
            movie_country = _first_or_placeholder(country_re.findall(release_line))
            print('\n')
            print('电影排名:', movie_rank)
            print('电影名称:', movie_name)
            print('电影导演:', movie_director)
            print('电影主演:', movie_actor)
            print('电影年份:', movie_year)
            print('电影国家:', movie_country)
            records.append('\n电影排名:' + movie_rank + '\n电影名称:' + movie_name + '\n电影导演:' + movie_director + '\n电影主演:' + movie_actor + '\n电影年份:' + movie_year + '\n电影国家:' + movie_country + '\n')

        # Open the output file once per page rather than once per movie.
        if records:
            with open('豆瓣TOP250.txt', 'a+', encoding='utf-8') as f:
                f.writelines(records)
        break
注意:本文使用的是xpath解析搭配re正则表达式实现数据的获取,纯re正则获取可以参考下面的解析。
# Regex patterns that pull the individual fields out of one movie's HTML.
# NOTE(review): director_pattern stops at '&', which suggests html_snippet is
# raw HTML containing &nbsp; entities, while year_pattern expects plain
# spaces around '/' — confirm the separators against the actual page source.
title_pattern = r'<span class="title">(.*?)</span>'
director_pattern = r'导演: (.*?)&'
year_pattern = r'(\d{4}) / (.*?) / '
rating_pattern = r'<span class="rating_num" property="v:average">(.*?)</span>'
votes_pattern = r'(\d+)人评价'


def _group_or_default(match, group=1, default='空'):
    """Return ``match.group(group)``, or ``default`` when ``match`` is None.

    ``re.search`` returns None when the pattern is not found; calling
    ``.group`` on that result directly would raise ``AttributeError``.
    """
    return match.group(group) if match is not None else default


# Extract each field; a field whose pattern is not found becomes '空'.
title = _group_or_default(re.search(title_pattern, html_snippet, re.DOTALL))
director = _group_or_default(re.search(director_pattern, html_snippet, re.DOTALL))
# Year and country come from the two groups of a single match.
year_country_match = re.search(year_pattern, html_snippet, re.DOTALL)
year = _group_or_default(year_country_match, 1)
country = _group_or_default(year_country_match, 2)
rating = _group_or_default(re.search(rating_pattern, html_snippet, re.DOTALL))
votes = _group_or_default(re.search(votes_pattern, html_snippet, re.DOTALL))

# Print extracted information
print(f'Title: {title}')
print(f'Director: {director}')
print(f'Year: {year}')
print(f'Country: {country}')
print(f'Rating: {rating}')
print(f'Votes: {votes}')
或者
# Narrow the page down to the <ol class="grid_view"> list, then to its <li> items.
obj_data = re.compile('<ol class="grid_view">(.*?)</ol>', re.S)
obj_List = re.compile('<li>(.*?)</li>', re.S)
found_lists = obj_data.findall(content)
# Content without the list (e.g. a verification page) yields no items
# instead of raising IndexError.
data_content = found_lists[0] if found_lists else ''
# print(data_content)
data_list = obj_List.findall(data_content)
# One pattern with named groups for all fields of a single <li> item.
# NOTE(review): the separators in this pattern are plain spaces; the raw HTML
# may use &nbsp; entities instead — confirm against the actual page source.
obj = re.compile('<em class="">(?P<rank>.*?)</em>'
                 '(.*?)<span class="title">(?P<name>.*?)</span>(.*?)'
                 '导演:(?P<director>.*?) (.*?)'
                 '主演:(?P<actor>.*?)'
                 '<br>(?P<year>.*?) / (?P<country>.*?) / (?P<type>.*?)</p>'
                 , re.S)
for data in data_list:
    # The second positional argument of a compiled pattern's search() is the
    # start position, not a flags value; re.S is already set at compile time,
    # so passing it here would skip the first 16 characters of every item.
    resp = obj.search(data)
    print('\n')
    if resp is None:
        # Skip items that do not fit the pattern instead of failing on None.
        print('解析失败,已跳过该条目')
        continue
    print('电影排名:', resp.group('rank'))
    print('电影名称:', resp.group('name'))
    print('电影导演:', resp.group('director'))
    print('电影主演:', resp.group('actor'))
    print('电影年份:', resp.group('year'))
    print('电影国家:', resp.group('country'))
re正则表达式的使用方法很多,可以参考github仓库的其他示例
②:实现翻页
# Paginated crawl: walk the 10 pages of the Top250 list, 25 movies per page.
cur_page = 1
# The cookie lives outside the loop so that a value entered after a
# verification page is actually sent with the following requests.
cookie = 'bid=QWUV40oy-FY; _pk_id.100001.4cf6=f2805921a5ad8609.1685872579.; ll="108309"; _vwo_uuid_v2=D12C7D490102E90C58FD2D45CA5ACB05B|01cb4d51178de9859d5df7992ca64578; douban-fav-remind=1; dbcl2="273587585:LGbDVmaWfD8"; push_noty_num=0; push_doumail_num=0; __utmz=30149280.1692524870.12.8.utmcsr=movie.douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/top250; __utmv=30149280.27358; ck=_afb; __utma=30149280.2127834450.1685872580.1692524870.1692526982.13; __utmc=30149280; __utmt=1; __utmb=30149280.2.10.1692526982; __utma=223695111.363614075.1685872580.1692524924.1692526991.12; __utmb=223695111.0.10.1692526991; __utmc=223695111; __utmz=223695111.1692526991.12.9.utmcsr=douban.com|utmccn=(referral)|utmcmd=referral|utmcct=/misc/sorry; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1692526991%2C%22https%3A%2F%2Fwww.douban.com%2Fmisc%2Fsorry%3Foriginal-url%3Dhttps%3A%2F%2Fmovie.douban.com%2Ftop250%3Fstart%3D25%26filter%3D%22%5D; _pk_ses.100001.4cf6=1'
captcha_notice = '你访问豆瓣的方式有点像机器人程序。为了保护用户的数据,请向我们证明你是人类:'
# Patterns for the two text lines of each entry, compiled once up front.
obj_director = re.compile('导演: (.*?)\xa0', re.S)
obj_actor = re.compile('主演: (.*)', re.S)
obj_year = re.compile('(.*?)\xa0', re.S)
boj_country = re.compile('\xa0/\xa0(.*?)\xa0/\xa0', re.S)

while True:
    # Stop before requesting a page past the last one (start=250 has no data).
    if cur_page > 10:
        print("状态异常或无数据可爬取")
        break
    print("开始请求")
    time.sleep(3)
    url = f'https://movie.douban.com/top250?start={(cur_page-1)*25}&filter='
    _headers = {
        'Cookie': cookie,
        "User-Agent": random.choice(user_agent_list),
    }
    # A timeout keeps a stalled connection from blocking the crawl forever.
    resp = requests.get(url=url, headers=_headers, timeout=10)
    print(resp.status_code)
    if resp.status_code != 200:
        print("状态异常或无数据可爬取")
        break
    content = resp.text
    # print(content)
    if captcha_notice in content:
        # Verification page: take a new cookie and retry the same page.
        cookie = str(input("更新cookie值: "))
        continue

    html = etree.HTML(content)
    data_list = html.xpath('//ol[@class="grid_view"]//li')
    for data in data_list:
        rank_nodes = data.xpath('.//em[@class=""]/text()')
        movie_rank = rank_nodes[0] if rank_nodes else '空'
        movie_name = data.xpath('.//span[@class="title"]/text()')
        # data.xpath('.//p[@class=""]/text()') returns two text lines, e.g.
        # ['\n 导演: 马丁·布莱斯 Martin Brest\xa0\xa0\xa0主演: 阿尔·帕西诺 Al Pacino / 克里斯...', '\n 1992\xa0/\xa0美国\xa0/\xa0剧情\n ']
        info_lines = data.xpath('.//p[@class=""]/text()')
        people_line = info_lines[0] if len(info_lines) > 0 else ''
        release_line = info_lines[1] if len(info_lines) > 1 else ''
        # A missing field becomes '空' instead of raising IndexError.
        directors = obj_director.findall(people_line)
        movie_director = directors[0] if directors else "空"
        actors = obj_actor.findall(people_line)
        movie_actor = actors[0] if actors else '空'
        years = obj_year.findall(release_line)
        movie_year = years[0].strip('\n ') if years else '空'
        countries = boj_country.findall(release_line)
        movie_country = countries[0] if countries else '空'
        print('\n')
        print('电影排名:', movie_rank)
        print('电影名称:', movie_name[0] if movie_name else '空')
        print('电影导演:', movie_director)
        print('电影主演:', movie_actor)
        print('电影年份:', movie_year)
        print('电影国家:', movie_country)
    cur_page += 1
最终效果展示
四、结语
根据示例,在使用re正则表达式提取数据的时候,最好提前用re.compile预编译好正则表达式,比如
obj_director = re.compile('导演: (.*?)\xa0', re.S)
下文只需要调用obj_director.findall(movie_director)[0]即可。
本文源码位于 Python爬虫之路 https://github.com/rosyrain/spider-course 的lesson13中。欢迎各位Follow/Star/Fork ( •̀ ω •́ )✧
有任何问题欢迎大家评论和指正。再次声明,本专栏只做技术探讨,严禁商用、恶意攻击等。
这是我的 GitHub 主页:Rosyrain (github.com) https://github.com/rosyrain ,里面有一些我学习时候的笔记或者代码。本专栏的文档和源码存到spider-course的仓库下。
欢迎大家Follow/Star/Fork三连。