# 1 - Data -- requests
"""Scrape the short comments of one Douban movie (10 pages).

Steps:
1. Download each comment page.
2. Extract the comment text from the page.
3. Store the collected comments in a text file.
"""
from pathlib import Path

import requests
from lxml import html

# NOTE(review): Douban appears to reject the default python-requests
# User-Agent, so a browser-like one is sent -- confirm against the live site.
REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
}
# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 10
# Comment-list URL of the movie; ``{}`` receives the offset of the first comment.
COMMENTS_URL = (
    "https://movie.douban.com/subject/30334073/comments"
    "?start={}&limit=20&sort=new_score&status=P"
)
# Number of comments requested per page (matches ``limit=20`` in the URL).
PAGE_SIZE = 20
# Built with pathlib so the separator is correct on every platform.
OUTPUT_PATH = Path("source") / "豆瓣短评.txt"


def download(url):
    """Download *url* and return it as a parsed lxml HTML tree.

    Args:
        url: Address of the page to fetch.

    Returns:
        The root element produced by ``lxml.html.fromstring``.

    Raises:
        requests.RequestException: If the request fails, times out, or the
            server answers with an HTTP error status.
    """
    response = requests.get(url, headers=REQUEST_HEADERS, timeout=REQUEST_TIMEOUT)
    # Fail loudly on 4xx/5xx rather than parsing an error page as if it
    # were a comment list.
    response.raise_for_status()
    return html.fromstring(response.text)


def getvalues(code):
    """Extract the short-comment texts from one parsed comment page.

    Args:
        code: Parsed HTML tree of a comment page (anything exposing
            ``xpath``).

    Returns:
        A list with the first text node of every comment item, in page
        order. Items without any comment text are skipped.
    """
    text_list = []
    for div in code.xpath('//div[@class="comment-item"]'):
        text = div.xpath('div[@class="comment"]/p/span/text()')
        # An item may carry no text node; indexing [0] blindly would raise
        # IndexError and abort the whole run.
        if text:
            text_list.append(text[0])
    return text_list


if __name__ == "__main__":
    pages = 10  # number of comment pages to download
    all_text = []  # comments collected across all pages

    for page in range(pages):
        url = COMMENTS_URL.format(page * PAGE_SIZE)
        # 1 - download
        try:
            code = download(url)
        except requests.RequestException as exc:
            # Stop paging but still save what was collected so far.
            print("Failed to download {}: {}".format(url, exc))
            break
        # 2 - extract the comments of the current page
        all_text.extend(getvalues(code))

    # 3 - store; create the target directory if it does not exist yet
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
        f.writelines(text + "\n" for text in all_text)
# Python+requests+lxml爬取豆瓣电影短评
# 最新推荐文章于 2020-12-30 07:26:38 发布