import requests
from urllib import response  # NOTE(review): unused auto-import; safe to delete
import lxml.html
import csv
from requests import Response  # NOTE(review): unused auto-import; safe to delete

# Douban Top 250 listing, paginated 25 movies per page via the `start` offset.
doubanurl = 'https://movie.douban.com/top250?start={}&filter='

# Douban serves an empty / blocked body to clients without a browser-like
# User-Agent, which is what makes lxml raise
# "lxml.etree.ParserError: Document is empty" downstream.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/120.0 Safari/537.36',
}


def getSource(url):
    """Fetch one listing page and return its decoded HTML text (str).

    Raises requests.HTTPError on a non-2xx response instead of silently
    handing an error page to the parser.
    """
    resp = requests.get(url, headers=HEADERS, timeout=10)
    resp.raise_for_status()  # fail loudly on 403/418 instead of parsing junk
    resp.encoding = 'utf-8'
    # BUG FIX: the original returned resp.content (raw bytes), which ignores
    # the encoding we just set; .text honors it.
    return resp.text


def getEveryItem(source):
    """Parse one page's HTML and return a list of movie dicts.

    Each dict has the keys: title, url, star, quote (all strings).
    """
    selector = lxml.html.document_fromstring(source)
    movieItemList = selector.xpath('//div[@class="info"]')
    movieList = []
    # BUG FIX: the original looped `for eachMovie in movieList:` over the
    # freshly created empty list, so nothing was ever extracted.
    for eachMovie in movieItemList:
        movieDict = {}
        title = eachMovie.xpath('div[@class="hd"]/a/span[@class="title"]/text()')
        otherTitle = eachMovie.xpath('div[@class="hd"]/a/span[@class="other"]/text()')
        link = eachMovie.xpath('div[@class="hd"]/a/@href')[0]
        star = eachMovie.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
        quote = eachMovie.xpath('div[@class="bd"]/p[@class="quote"]/span/text()')
        movieDict['title'] = ''.join(title + otherTitle)
        movieDict['url'] = link
        # xpath() returns lists; store scalars. Some movies have no quote,
        # so fall back to '' instead of raising IndexError.
        movieDict['star'] = star[0] if star else ''
        movieDict['quote'] = quote[0] if quote else ''
        movieList.append(movieDict)
    return movieList


def writeData(movieList):
    """Write the collected movie dicts to ./DouBanMovie.csv."""
    # newline='' is required by the csv module; without it Windows
    # produces a blank line after every row.
    with open('./DouBanMovie.csv', 'w', encoding='UTF-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=['title', 'star', 'quote', 'url'])
        writer.writeheader()
        for each in movieList:
            writer.writerow(each)


if __name__ == '__main__':
    movieList = []
    for i in range(10):  # 10 pages x 25 movies = Top 250
        pageLink = doubanurl.format(i * 25)
        print(pageLink)
        source = getSource(pageLink)
        movieList += getEveryItem(source)
    print(movieList[:10])
    writeData(movieList)
爬取网页时出现 `lxml.etree.ParserError: Document is empty` 异常,想知道哪里出现了错误。源代码如下:
于 2023-11-17 08:28:43 首次发布