题目:
获取豆瓣TOP250电影的中英文名、港台名、导演、上映年份、电影分类以及评分,将数据存入文档。
代码:
import codecs
import csv
import re
from requests_html import HTMLSession
# Listing URL prefix; the page offset (0, 25, ..., 225) is appended per request.
# The scheme must be written without embedded spaces or every request built
# from this prefix targets an invalid URL.
base_url = "https://movie.douban.com/top250?start="
# One HTML session object, reused for every page request below.
session = HTMLSession()
# Collects one dict per movie whose info block could be parsed.
result_list = list()
def filter(s):
    """Normalise a text fragment scraped from the movie list.

    Surrounding whitespace is removed, every run of whitespace is collapsed
    to a single space, the spaced separator " / " is tightened to "/", and
    one leading "/" (plus any whitespace after it) is dropped.

    NOTE: the name shadows the builtin ``filter``; it is kept because the
    rest of the script calls it under this name.

    :param s: the raw text, or a falsy value (``None`` / ``""``).
    :return: the cleaned string; falsy input is returned unchanged.
    """
    if not s:
        return s
    # Raw string so that "\s" is a regex class, not an invalid str escape.
    s = re.sub(r'\s+', ' ', s.strip())
    s = s.replace(" / ", "/")
    # startswith() rather than s[0]: whitespace-only input is empty by now
    # and indexing it would raise IndexError.
    if s.startswith("/"):
        s = s[1:]
    return s.strip()
# Fetch 10 listing pages (offsets 0, 25, ..., 225) and collect one record per
# movie into result_list.
for i in range(0, 10):
    url = base_url + str(i * 25)
    response = session.get(url)
    html = response.html
    div = html.xpath('//*[@class="grid_view"]', first=True)
    if div is None:
        # The list container is missing (error page, blocked request, layout
        # change): report the page and continue instead of failing on None.
        print(url, response.status_code, "no grid_view element found")
        continue
    div_list = div.xpath('//li')
    for d in div_list:
        # The three title spans: Chinese name, English name, HK/TW names.
        z_name = filter(d.xpath('//div/div[2]/div[1]/a/span[1]/text()', first=True))
        e_name = filter(d.xpath('//div/div[2]/div[1]/a/span[2]/text()', first=True))
        g_name = filter(d.xpath('//div/div[2]/div[1]/a/span[3]/text()', first=True))
        info_list = d.xpath('//div/div[2]/div[2]/p/text()')
        if len(info_list) <= 1:
            # Need two text nodes (credits line, then year/.../genre line);
            # log the entry and skip it when they are not both present.
            print(url, z_name, info_list)
            continue
        txt1 = info_list[0].strip()
        # The first three characters are dropped (presumably the "导演:"
        # label - confirm against the page); the director text ends where
        # the first "主" starts, when there is one.
        cut = txt1.find("主")
        director = (txt1[3:cut] if cut != -1 else txt1[3:]).strip()
        txt_info = info_list[1].strip().split("/")
        year = txt_info[0].strip()
        # Take the last "/"-separated field as the category rather than a
        # fixed index 2: a line with fewer fields no longer raises
        # IndexError, and one with extra fields still yields its final field.
        _type = txt_info[-1].strip()
        rating_num = d.xpath('.//span[@class="rating_num"]/text()', first=True)
        result_list.append({
            "中文名": z_name,
            "英文名": e_name,
            "港台名": g_name,
            "导演": director,
            "上映年份": year,
            "分类": _type,
            "评分": rating_num,
        })

# Write all records as CSV rows (no header row). The with-block closes the
# file even if a row fails to serialise; newline='' lets the csv module
# control line endings itself. The "data" directory must already exist.
with open('data/movie_test.csv', 'w', encoding='utf-8', newline='') as csv_file:
    writer = csv.writer(csv_file)
    for d in result_list:
        writer.writerow((d['中文名'], d['英文名'], d['港台名'], d['导演'], d['上映年份'], d['分类'], d['评分']))
输出结果: