爬取效果如下:
使用工具:
Python 3.9
pycharm
爬虫的一般思路:
1.拿到页面源代码 requests
2.通过re正则来提取想要的有效信息 re
3.csv 是存储格式,方便数据分析
#拿到页面源代码 requests
#通过re正则来提取想要的有效信息 re
#csv 是存储格式,方便数据分析
import requests
import re
import csv
# Scrape one page of the Douban Top 250 movie list and store it as CSV.
# Flow: fetch page source (requests) -> extract fields (re) -> persist (csv).

# Douban pages by offset: start=0 is the first page (movies 1-25).
# NOTE(review): the original used start=1, which silently skipped the #1 movie.
url = "https://movie.douban.com/top250?start=0"
headers = {
    # A browser User-Agent is required: Douban blocks the default
    # python-requests UA and returns no usable content.
    "User-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36 Edg/97.0.1072.62"
}
resp = requests.get(url, headers=headers)
resp.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
page_source = resp.text
resp.close()  # release the connection back to the pool

# Parse the data: one match per movie <li> entry. re.S lets '.' span newlines.
obj = re.compile(r'<li>.*?<div class="item">.*? <span class="title">(?P<name>.*?)</span>'
                 r'.*?<p class="">.*?<br>(?P<year>.*?) '
                 r'.*?<span class="rating_num" property="v:average">(?P<fen>.*?)</span>'
                 r'.*?<span>(?P<num>.*?)人评价</span>', re.S)

# Match and write out.
# newline="" stops the csv module from emitting blank rows on Windows;
# utf-8 keeps the Chinese titles intact regardless of the OS default codec.
with open("data.csv", mode="w", newline="", encoding="utf-8") as f:
    csvwriter = csv.writer(f)
    for it in obj.finditer(page_source):
        dic = it.groupdict()
        # strip the leading newline/indentation captured before the year
        dic['year'] = dic['year'].strip()
        csvwriter.writerow(dic.values())
print("over!")