导入模块
import requests
from lxml import etree
import csv
主体代码
class Douban:
    """Scrape the Douban Top 250 movie list and write it to a CSV file.

    Each list page is downloaded with ``requests``, parsed with
    ``lxml.etree`` and reduced to one ``{'title', 'url'}`` row per movie.
    """

    # Movies per list page; also the step of the ``start`` query parameter.
    PAGE_SIZE = 25
    # Number of list pages to fetch (10 pages * 25 movies = 250 movies).
    PAGE_COUNT = 10
    # Seconds to wait on the server before a request is abandoned.
    REQUEST_TIMEOUT = 10

    def __init__(self):
        # ``{}`` is filled with the zero-based offset of the page's first movie.
        self.starturl = 'https://movie.douban.com/top250?start={}&filter='

    def run(self):
        """Fetch every list page, parse it and write all rows to the CSV."""
        movielist = []
        for page in range(self.PAGE_COUNT):
            pageurl = self.starturl.format(page * self.PAGE_SIZE)
            text = self.get_text(pageurl)
            movielist.extend(self.analysis_text(text))
        self.writeData(movielist)

    def get_text(self, url):
        """Download ``url`` and return the response body decoded as UTF-8.

        Raises:
            requests.HTTPError: if the server answers with a 4xx/5xx status.
            requests.Timeout: if no response arrives within REQUEST_TIMEOUT.
        """
        header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"
        }
        # Without a timeout requests may block forever on a stalled connection.
        response = requests.get(url, headers=header, timeout=self.REQUEST_TIMEOUT)
        # Fail loudly instead of parsing an error page into zero rows.
        response.raise_for_status()
        response.encoding = "utf-8"
        return response.text

    def analysis_text(self, text):
        """Parse one list page and return a list of ``{'title', 'url'}`` dicts."""
        element = etree.HTML(text)
        # Defensive: a parse that yields no tree has no movies to report.
        if element is None:
            return []
        movielist = []
        for eachmovie in element.xpath('//div[@class="info"]'):
            title = eachmovie.xpath('./div[@class="hd"]/a/span[@class="title"]/text()')
            move_link = eachmovie.xpath('div[@class="hd"]/a/@href')
            othertitle = eachmovie.xpath('div[@class="hd"]/a/span[@class="other"]/text()')
            movielist.append(self._build_row(title, othertitle, move_link))
        return movielist

    @staticmethod
    def _build_row(titles, other_titles, links):
        """Combine the xpath results for one movie into a CSV row dict.

        ``links`` is a list of matched ``href`` values; only the first is
        kept so the CSV holds a plain URL string rather than a list repr.
        A movie without any matched link gets an empty ``url``.
        """
        return {
            'title': ''.join(list(titles) + list(other_titles)),
            'url': links[0] if links else '',
        }

    def writeData(self, movielist, path='douban.csv'):
        """Write ``movielist`` to ``path`` as CSV with a ``title,url`` header.

        Args:
            movielist: iterable of dicts with the keys ``title`` and ``url``.
            path: output file; defaults to ``douban.csv`` in the working
                directory, matching the original hard-coded location.
        """
        # newline='' lets the csv module control line endings itself.
        with open(path, 'w', encoding='utf-8', newline='') as f:
            writer = csv.DictWriter(f, fieldnames=['title', 'url'])
            writer.writeheader()  # header row
            writer.writerows(movielist)
# Script entry point: crawl all pages and write the CSV when run directly.
if __name__ == '__main__':
    crawler = Douban()
    crawler.run()
**第一次写博客,多多指教**