Python日常—爬取豆瓣250条电影记录


"""Scrape the Douban Top-250 movie listing and save it to ``Douban.csv``.

The listing is fetched 25 movies per request via the ``start`` offset in
``doubanUrl``; every page is parsed with lxml and the combined rows are
written as a UTF-8 CSV file in the current working directory.
"""

import csv

import lxml.html
import requests

# Paginated listing URL; ``{}`` is filled with the offset of the first movie.
doubanUrl = 'https://movie.douban.com/top250?start={}&filter='

# NOTE(review): the site is reported to reject the default python-requests
# User-Agent, so a browser-like one is sent -- confirm against the live site.
requestHeaders = {
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                  '(KHTML, like Gecko) Chrome/120.0 Safari/537.36',
}

# Column order of the generated CSV file.
csvFieldNames = ['title', 'star', 'quote', 'url']


def getSource(doubanUrl):
    """Download one listing page.

    Args:
        doubanUrl: Fully formatted page URL.

    Returns:
        The raw response body as ``bytes``; lxml decodes it while parsing.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-2xx HTTP status.
    """
    # A timeout keeps the script from hanging forever on a stalled connection.
    response = requests.get(doubanUrl, headers=requestHeaders, timeout=10)
    # Fail loudly rather than silently parsing an error page into zero rows.
    response.raise_for_status()
    # Setting ``response.encoding`` would only influence ``response.text``,
    # so the bytes are returned untouched.
    return response.content


def _firstOrEmpty(nodes):
    """Return the first XPath result, or '' when nothing matched."""
    return nodes[0] if nodes else ''


def getEveryItem(source):
    """Extract the movie entries from one listing page.

    Args:
        source: HTML of a listing page (``bytes`` or ``str``).

    Returns:
        A list of dicts with the keys ``title``, ``url``, ``star`` and
        ``quote``; every value is a string, '' when the element is missing.
    """
    selector = lxml.html.document_fromstring(source)
    movieList = []
    for eachMovie in selector.xpath('//div[@class="info"]'):
        title = eachMovie.xpath(
            'div[@class="hd"]/a/span[@class="title"]/text()')
        otherTitle = eachMovie.xpath(
            'div[@class="hd"]/a/span[@class="other"]/text()')
        link = eachMovie.xpath('div[@class="hd"]/a/@href')
        # Rating and quote are searched anywhere below the item so the lookup
        # does not depend on which child <div> wraps them.
        star = eachMovie.xpath(
            './/div[@class="star"]/span[@class="rating_num"]/text()')
        quote = eachMovie.xpath('.//p[@class="quote"]/span/text()')
        movieList.append({
            'title': ''.join(title + otherTitle),
            'url': _firstOrEmpty(link),
            'star': _firstOrEmpty(star),
            'quote': _firstOrEmpty(quote),
        })
    return movieList


def writeData(movieList):
    """Write the movie dicts to ``./Douban.csv`` with a header row.

    Args:
        movieList: Iterable of dicts keyed by ``title``, ``star``, ``quote``
            and ``url``.
    """
    # newline='' lets the csv module control line endings itself.
    with open('./Douban.csv', 'w', encoding='UTF-8', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=csvFieldNames)
        writer.writeheader()
        writer.writerows(movieList)


def main():
    """Fetch all ten listing pages and store the combined result."""
    movieList = []
    for i in range(10):
        pageLink = doubanUrl.format(i * 25)
        print(pageLink)
        # Accumulate across pages instead of replacing the previous page.
        movieList.extend(getEveryItem(getSource(pageLink)))
    print(movieList[:10])
    writeData(movieList)


if __name__ == '__main__':
    main()

转载于:https://www.cnblogs.com/zxycb/p/9823311.html

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值