简介:学习使用爬虫爬取豆瓣电影 Top250 第一页的代码,以及心得分析
1导入模块
#导入requests、re模块
import requests
import re
2请求与响应,[headers作用](https://blog.csdn.net/u011031422/article/details/77156507?)
# Request headers sent along with the GET request (a Referer and a
# browser-style User-Agent).
headers = {
    'Referer': 'https://img3.doubanio.com/dae/accounts/resources/d3e2921/movie/bundle.css',
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 '
}
# Address of the page to fetch.
url = 'https://movie.douban.com/top250'
# Send the request and receive the response. Without a timeout, requests
# waits indefinitely when the server never answers, so bound the wait.
resp = requests.get(url, headers=headers, timeout=10)
# Fail loudly on a 4xx/5xx answer instead of printing and parsing an error page.
resp.raise_for_status()
# Print the response body decoded as text.
print(resp.text)
3正则表达式使用
# Pattern for one movie entry. Named groups:
#   name  - movie title
#   year  - release year (surrounded by whitespace, stripped when printed)
#   score - rating
#   n     - number of people who rated
# re.S lets '.' match newlines, so a single match can span several HTML lines.
# NOTE(review): the year is assumed to be followed by the literal `&nbsp;`
# entity in the raw HTML; a plain space here would stop the lazy `year` group
# at the first whitespace after <br>. Confirm against the page source.
obj = re.compile(
    r'<li>.*?<div class="item">.*?<span class="title">(?P<name>.*?)'
    r'</span>.*?<p class="">.*?<br>(?P<year>.*?)&nbsp;.*?<span '
    r'class="rating_num" property="v:average">(?P<score>.*?)</span>.*?'
    r'<span>(?P<n>.*?)人评价</span>',
    re.S,
)
# Lazily iterate over every match found in the fetched page text.
result = obj.finditer(resp.text)
# Print, for each movie: title, year, number of raters, score.
for match in result:
    print(match.group('name'))
    print(match.group('year').strip())
    print(match.group('n'))
    print(match.group('score'))
结果显示前2个: