import re
from time import sleep
import requests
from lxml import etree
import random
import csv
def main(page, f):
    """Scrape one page of the Douban Top 250 list and process each movie on it.

    The list page is fetched, the detail-page link and title of every movie
    are extracted, and ``get_info`` is called for each pair. A failure on a
    single movie is reported and skipped so the rest of the page is still
    processed.

    Args:
        page: Zero-based page index. Each page lists 25 movies, so the
            request uses ``start = page * 25``.
        f: The open output file object. It is flushed before each movie is
            processed so rows written so far are pushed out of the buffer.

    Raises:
        requests.RequestException: If the list page itself cannot be
            fetched (including a timeout).
    """
    list_url = f'https://movie.douban.com/top250?start={page * 25}&filter='
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 Safari/537.36',
    }
    # Without a timeout requests may block forever on a stalled connection.
    resp = requests.get(list_url, headers=headers, timeout=10)
    tree = etree.HTML(resp.text)
    href_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/@href')
    name_list = tree.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/span[1]/text()')

    for movie_url, name in zip(href_list, name_list):
        f.flush()
        try:
            get_info(movie_url, name)
        except Exception as exc:
            # Best-effort scraping: one broken detail page must not abort the
            # run, but the failure is reported instead of silently dropped.
            # Catching Exception (not a bare except) lets Ctrl-C still work.
            print(f'跳过 {name} ({movie_url}): {exc!r}')
        # Randomised pause between detail-page requests.
        sleep(1 + random.random())

    # Use the parameter rather than a loop variable of the calling script.
    print(f'第{page + 1}页爬取完毕')
def _first_match(matches, field):
    """Return the first element of *matches*, naming *field* if there is none.

    Raises:
        IndexError: If *matches* is empty.
    """
    if not matches:
        raise IndexError(f'{field} not found on the detail page')
    return matches[0]


def get_info(url, name):
    """Fetch one movie's detail page and write its data as a CSV row.

    The director, genres, country/region and rating are extracted from the
    page, printed, and written through the module-level ``csvwriter``, which
    must already exist when this function is called.

    Args:
        url: URL of the movie's detail page.
        name: Movie title, as taken from the list page.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
        IndexError: If the director, country/region or rating cannot be
            found in the page.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.35 Safari/537.36',
        'Host': 'movie.douban.com',
    }
    # Without a timeout requests may block forever on a stalled connection.
    resp = requests.get(url, headers=headers, timeout=10)
    # Fail with a clear HTTP error instead of a confusing parse failure later.
    resp.raise_for_status()
    html = resp.text
    tree = etree.HTML(html)

    director = _first_match(
        tree.xpath('//*[@id="info"]/span[1]/span[2]/a/text()'), 'director')
    # A movie can have several genres; they are joined with "/". No genre at
    # all yields an empty string rather than an error.
    type_ = '/'.join(re.findall(r'property="v:genre">(.*?)</span>', html))
    country = _first_match(
        re.findall(r'地区:</span> (.*?)<br', html), 'country/region')
    rate = _first_match(
        tree.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'),
        'rating')

    print(name, director, type_, country, rate)
    csvwriter.writerow((name, director, type_, country, rate))
if __name__ == '__main__':
    # Script entry point: scrape the 10 list pages and collect every movie
    # into one CSV file. Append mode keeps the rows of earlier runs.
    with open('学号_姓名.csv', 'a', encoding='utf-8', newline='') as f:
        # Module-level name on purpose: get_info() writes its rows through it.
        csvwriter = csv.writer(f)
        # In append mode the stream starts at the end of the file, so a
        # position of 0 means the file is empty and still needs its header.
        # This keeps re-runs from inserting a header row among the data.
        if f.tell() == 0:
            csvwriter.writerow(('电影名称', '导演', '电影类型', '国家', '评分'))
        # The loop variable stays a module-level global named ``i``.
        for i in range(10):
            main(i, f)
            # Randomised pause between list pages.
            sleep(3 + random.random())
# Python web-scraping exercise 1
# First published 2024-11-26 19:23:51