# The crawler fetches pages with the requests module, extracts the data with XPath, and writes the results to a CSV file.
import csv
import requests
from lxml import html
etree = html.etree
# Request headers passed to every requests.get() call in this file.
# The User-Agent is a desktop browser string -- presumably so the site
# serves the regular HTML pages to the script (TODO confirm).
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.51'
    ),
}

# Shared accumulator: url_get() appends the detail-page links it finds here,
# and the script entry point iterates over it afterwards.
url_total = list()
def url_get(url):
    """Collect the movie detail-page links found on one listing page.

    Downloads *url*, evaluates the listing XPath against the parsed HTML and
    appends every matched ``href`` to the module-level ``url_total`` list.

    Args:
        url: Address of the listing page to scan.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    # Fail loudly on 4xx/5xx instead of parsing an error page, which would
    # silently contribute zero links.
    response.raise_for_status()
    et = etree.HTML(response.text)
    movie_url = et.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[2]/div[1]/a/@href')
    url_total.extend(movie_url)
def _pick(nodes, index=0, default='null'):
    """Return ``nodes[index]``, or *default* when the XPath result is too short."""
    return nodes[index] if len(nodes) > index else default


def information_get(url):
    """Scrape one movie detail page and yield its fields as a single dict.

    This is a generator: the page is only downloaded once iteration starts,
    and exactly one record is yielded. The key order of the record is
    significant because ``write_to_file`` writes ``dict.values()`` as the CSV
    row.

    Fields that are read as a single value (name, year, poster, country,
    language, director, runtime, score, stills link) fall back to the string
    ``'null'`` when the page does not contain them, matching the placeholder
    already used for a missing ``movie_reward``. Multi-valued fields are kept
    as the (possibly empty) list returned by XPath.

    Args:
        url: Address of the movie detail page.

    Yields:
        dict: The scraped record, including ``url`` under ``'movie_url'``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    # Without a timeout requests may block forever on a stalled connection.
    response = requests.get(url=url, headers=headers, timeout=10)
    response.raise_for_status()
    et = etree.HTML(response.text)

    # The <h1> text nodes are evaluated once; index 1 is used as the title
    # and index 3 as the release year (the nodes in between are presumably
    # whitespace -- verify against the live markup).
    title_parts = et.xpath('//div[@id="content"]//h1//text()')
    movie_name = _pick(title_parts, 1)
    first_run_time = _pick(title_parts, 3)

    movie_photo = _pick(et.xpath('//div[@id="mainpic"]//a/img/@src'))
    movie_type = et.xpath('//div[@id="info"]//span[@property="v:genre"]/text()')
    movie_country = _pick(et.xpath(
        '//span[@class="pl" and contains(text(), "制片国家/地区:")]/following-sibling::text()[1]'))
    movie_language = _pick(et.xpath(
        '//span[@class="pl" and contains(text(), "语言:")]/following-sibling::text()[1]'))
    movie_director = _pick(et.xpath('//*[@id="info"]/span[1]/span[2]/a/text()'))
    movie_scriptwriter = et.xpath('//*[@id="info"]/span[2]/span[2]/a/text()')
    movie_actor = et.xpath('//*[@id="info"]/span[3]/span[2]//a//text()')
    movie_time = _pick(et.xpath('//*[@id="info"]//span[@property = "v:runtime"]/text()'))
    movie_score = _pick(et.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/strong/text()'))
    movie_score_number = et.xpath('//*[@id="interest_sectl"]/div[1]/div[2]/div/div[2]/a/span/text()')

    # Use the second summary <span> when it has a direct text node, otherwise
    # the first one (the second is presumably the expanded synopsis -- TODO
    # confirm).
    if et.xpath('//*[@id="link-report-intra"]/span[2]/text()[1]'):
        intro_nodes = et.xpath('//*[@id="link-report-intra"]/span[2]//text()')
    else:
        intro_nodes = et.xpath('//*[@id="link-report-intra"]/span[1]//text()')
    movie_introduction = [p.strip() for p in intro_nodes]

    movie_pre_vue = _pick(et.xpath('//*[@id="related-pic"]/ul/li[1]/a/@href'))
    movie_comment = et.xpath('//*[@id="hot-comments"]/div/div/p/span/text()')
    # An empty result is replaced by the 'null' placeholder string.
    movie_reward = et.xpath('//*[@id="content"]/div[2]/div[1]/div[8]/ul/li[2]/text()') or 'null'

    yield {
        'movie_name': movie_name,
        'first_run_time': first_run_time,
        'movie_type': movie_type,
        'movie_country': movie_country,
        'movie_language': movie_language,
        'movie_director': movie_director,
        'movie_scriptwriter': movie_scriptwriter,
        'movie_actor': movie_actor,
        'movie_time': movie_time,
        'movie_introduction': movie_introduction,
        'movie_comment': movie_comment,
        'movie_score': movie_score,
        'movie_score_number': movie_score_number,
        'movie_reward': movie_reward,
        'movie_photo': movie_photo,
        'movie_url': url,
        'movie_pre_vue': movie_pre_vue,
    }
def write_to_file(content, file_name='movie2.csv'):
    """Append one CSV row per record in *content*.

    No header row is written; each record contributes only its values, in
    the dict's insertion order. Values that are not strings (for example
    lists) are stringified by the ``csv`` module.

    Args:
        content: Iterable of dicts (a generator is fine).
        file_name: Path of the CSV file to append to. Defaults to
            ``'movie2.csv'``, the path that used to be hard-coded.
    """
    # Append mode so successive calls accumulate rows; newline='' lets the
    # csv module control line endings itself.
    with open(file_name, 'a', newline='', encoding='utf-8') as f:
        csv.writer(f).writerows(record.values() for record in content)
if __name__ == '__main__':
    # Phase 1: walk the ten listing pages (the ``start`` offset advances in
    # steps of 25) and let url_get() fill ``url_total`` with detail links.
    for page in range(10):
        url_get('https://movie.douban.com/top250?start=' + str(page * 25) + '&filter=')

    # Phase 2: scrape every collected detail page and append it to the CSV.
    # Progress is measured against the number of links actually collected
    # rather than an assumed 250, so it reaches 100% even on a partial list.
    total = len(url_total)
    for num, url in enumerate(url_total, start=1):
        write_to_file(information_get(url))
        print(str(num * 100 / total) + '%')