import requests
from lxml import etree
import csv
# URL template for the Douban Top 250 listing pages; ``{}`` is filled with
# the ``start`` offset of the page being requested.
doubanUrl = 'https://movie.douban.com/top250?start={}&filter='
def getSource(url, timeout=10):
    """Download *url* and return the response body as UTF-8 text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed
            straight to ``requests.get``.

    Returns:
        The response body decoded as UTF-8.

    Raises:
        requests.exceptions.Timeout: The server did not answer in time.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status code.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.36'
    }
    # Without an explicit timeout a stalled connection blocks the scrape forever.
    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of handing an error page to the parser.
    response.raise_for_status()
    # Force UTF-8 rather than relying on the encoding requests guesses.
    response.encoding = 'utf-8'
    return response.text
def _firstOrEmpty(nodes):
    """Return the first element of *nodes*, or ``''`` when it is empty."""
    return nodes[0] if nodes else ''


def getEveryItem(source):
    """Parse one listing page into a list of movie dicts.

    Every ``<div class="info">`` element in *source* yields one dict with
    these keys:

    * ``title`` - all ``span.title`` texts followed by all ``span.other``
      texts under the ``hd`` link, concatenated without a separator.
    * ``url``   - the ``href`` of the ``hd`` link.
    * ``star``  - the text of the ``span.rating_num`` element.
    * ``quote`` - the first ``span`` text under ``p.quote``.

    ``url``, ``star`` and ``quote`` fall back to ``''`` when the element is
    absent, so one incomplete entry does not abort the whole page.

    Args:
        source: HTML text of a listing page.

    Returns:
        A list of dicts, one per entry, in document order.
    """
    html_element = etree.HTML(source)
    movieList = []
    for eachMovie in html_element.xpath('//div[@class="info"]'):
        title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()')
        otherTitle = eachMovie.xpath('div[@class="hd"]/a/span[@class="other"]/text()')
        movieList.append({
            'title': ''.join(title + otherTitle),
            'url': _firstOrEmpty(eachMovie.xpath('div[@class="hd"]/a/@href')),
            'star': _firstOrEmpty(eachMovie.xpath(
                'div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')),
            'quote': _firstOrEmpty(eachMovie.xpath(
                'div[@class="bd"]/p[@class="quote"]/span/text()')),
        })
    # Echo the parsed entries of this page to stdout.
    print(movieList)
    return movieList
def writeData(movieList, filename='doubanmv.csv'):
    """Write the collected movie dicts to a UTF-8 CSV file.

    The file is created or overwritten and starts with a header row. The
    columns are written in the order ``title``, ``star``, ``quote``, ``url``.

    Args:
        movieList: Iterable of dicts whose keys are drawn from the four
            column names above. A missing key is written as an empty cell.
        filename: Path of the CSV file to write. Defaults to the original
            hard-coded ``doubanmv.csv`` in the working directory.

    Raises:
        ValueError: A dict contains a key that is not one of the columns
            (the default behaviour of ``csv.DictWriter``).
    """
    # newline='' lets the csv module control line endings itself.
    with open(filename, 'w', encoding='utf-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()
        writer.writerows(movieList)
if __name__ == '__main__':
    # Walk the ten listing pages (start offsets 0, 25, ..., 225) and pool
    # every parsed entry into a single list.
    movieList = []
    for pageNumber in range(10):
        pageSource = getSource(doubanUrl.format(pageNumber * 25))
        movieList.extend(getEveryItem(pageSource))
    # Write everything that was collected to the CSV file.
    writeData(movieList)
# The scraped data is then saved in an Excel-readable format (CSV); the result is shown below.
import requests
import csv
from lxml import etree
class A:
    """Scraper that collects entries from the Douban Top 250 listing into a CSV file."""

    def __init__(self):
        # Column order of the CSV output; also the keys of every row dict.
        self.header = ('title', 'forward', 'point', 'writer')
        # Listing URL template; ``{}`` receives the ``start`` offset.
        self.url = 'https://movie.douban.com/top250?start={}&filter='
        # Request headers sent with every page download.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.57'}

    def getsource(self, url, timeout=10):
        """Download *url* and return its body decoded as UTF-8.

        Args:
            url: Address of the page to fetch.
            timeout: Seconds to wait for the server; passed to ``requests.get``.

        Returns:
            The response body as a ``str``.

        Raises:
            requests.exceptions.Timeout: The server did not answer in time.
            requests.exceptions.HTTPError: The server answered with a
                4xx/5xx status code.
        """
        # Without an explicit timeout a stalled connection blocks forever.
        response = requests.get(url, headers=self.headers, timeout=timeout)
        # Surface error statuses instead of parsing an error page.
        response.raise_for_status()
        return response.content.decode('utf-8')

    @staticmethod
    def _first(nodes):
        """Return the first element of *nodes*, or ``''`` when it is empty."""
        return nodes[0] if nodes else ''

    def getlocal(self, html):
        """Parse one listing page into a list of row dicts.

        Every ``<div class='info'>`` element yields one dict keyed by
        ``title``, ``forward``, ``point`` and ``writer``. Each value is the
        first match of its XPath, or ``''`` when nothing matches.

        Args:
            html: HTML text of a listing page.

        Returns:
            A list of dicts, one per entry, in document order.
        """
        html_element = etree.HTML(html)
        rows = []
        for item in html_element.xpath("//div[@class='info']"):
            rows.append({
                'title': self._first(item.xpath(
                    "div[@class='hd']/a/span[@class='title']/text()")),
                'forward': self._first(item.xpath(
                    "div[@class='bd']/p/span[@class='inq']/text()")),
                'point': self._first(item.xpath(
                    "div[@class='bd']/div[@class='star']/span[@class='rating_num']/text()")),
                # First text node of a <p> that carries a class attribute;
                # presumably the credits line - confirm against the page markup.
                'writer': self._first(item.xpath(
                    "div[@class='bd']/p[@class]/text()")),
            })
        return rows

    def writer(self, list1, filename='top250.csv'):
        """Write *list1* to a UTF-8 CSV file with a header row.

        Args:
            list1: Iterable of row dicts keyed by the names in ``self.header``.
            filename: Output path. Defaults to the original hard-coded
                ``top250.csv`` in the working directory.
        """
        # newline='' lets the csv module control line endings itself.
        with open(filename, 'w', encoding='utf-8', newline='') as f:
            Dwriter = csv.DictWriter(f, self.header)
            Dwriter.writeheader()
            Dwriter.writerows(list1)

    def main(self):
        """Fetch the ten listing pages (offsets 0, 25, ..., 225) and write all rows."""
        collected = []
        for i in range(10):
            page = self.getsource(self.url.format(i * 25))
            collected.extend(self.getlocal(page))
        self.writer(collected)
if __name__ == '__main__':
    # Build the scraper and run the full fetch-parse-write pipeline.
    scraper = A()
    scraper.main()