# Day 17 homework
# Scrape movie information from a movie website (Douban Top250)
import csv
import os
import re

import requests
from bs4 import BeautifulSoup
# Scrape all 10 pages (25 movies each) of the Douban Top250 list and
# write the collected records to files/data.csv.

# Browser UA header — Douban rejects requests that use the default
# python-requests User-Agent. Built once, reused for every page.
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36'
}

# Pre-compiled patterns for the "<year> / <country> / <genres>" info line
# (compiled once instead of on every movie of every page).
YEAR_RE = re.compile(r'<br/>\s*(\d{4}).*?/.*?/')
COUNTRY_RE = re.compile(r'<br/>\s*\d{4}.*?/\xa0(.*?)\xa0/')
GENRES_RE = re.compile(r'<br/>\s*\d{4}.*?/\xa0.*?\xa0/\xa0(.*?)\s*</p>')


def _parse_page(html):
    """Parse one Top250 listing page's HTML into a list of movie dicts.

    Returned keys: 电影名, 评分, 影评, 评分情况, 播出年份, 拍摄国家, 影片类型.
    """
    soup = BeautifulSoup(html, 'lxml')

    # Movie posters carry the title in their alt attribute; the last <img>
    # on the page is not a movie poster, so drop it.
    names = soup.select('img')
    names.pop()
    print(len(names))

    scores = soup.select('.rating_num')
    print(len(scores))

    # div.star contains 4 spans per movie; the 4th (index 3) holds the
    # "N people rated" text.
    star_spans = soup.select('div.star>span')
    print(len(star_spans))
    evaluations = [star_spans[i + 3] for i in range(0, len(star_spans), 4)]

    # One-line quote (.inq) is missing for some movies — use '' there.
    quotes = []
    for bd in soup.select('.info>.bd'):
        inq = bd.select_one('.inq')
        quotes.append(inq.text if inq else '')

    years, countries, genres = [], [], []
    for info in soup.select('.info>.bd>p:first-child'):
        markup = str(info)
        years += YEAR_RE.findall(markup)
        countries += COUNTRY_RE.findall(markup)
        genres += GENRES_RE.findall(markup)

    # zip truncates to the shortest sequence, matching the original
    # multi-iterable map() behavior.
    return [
        {'电影名': img.attrs['alt'], '评分': sc.text, '影评': quote,
         '评分情况': ev.text, '播出年份': year, '拍摄国家': country,
         '影片类型': genre}
        for img, sc, quote, ev, year, country, genre
        in zip(names, scores, quotes, evaluations, years, countries, genres)
    ]


def main():
    """Fetch all 10 pages and write the combined records to CSV."""
    rows = []
    for page in range(10):
        response = requests.get(
            f'https://movie.douban.com/top250?start={page * 25}&filter=',
            headers=HEADERS,
        )
        rows += _parse_page(response.text)

    # Original crashed with FileNotFoundError when files/ did not exist.
    os.makedirs('files', exist_ok=True)
    # `with` guarantees the file is flushed and closed (original leaked it).
    with open('files/data.csv', 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(
            f, ['电影名', '评分', '评分情况', '影评', '播出年份', '拍摄国家', '影片类型'])
        writer.writeheader()
        writer.writerows(rows)


if __name__ == '__main__':
    main()