我们需要获取的信息包括:电影名称,别称,评分,以及电影的核心主题。
通过对原网页的信息分析,获得以下内容:
网页分析:
对比相关网页区别后,对url的获取进行设计:
url = self.url.format(i * 25)
路径分析:
1.电影名称的xpath
li_name = li.xpath('.//div[@class="hd"]/a/span[@class="title"]/text()')
2.别称的xpath
li_othername = strip(li.xpath('.//div[@class="hd"]/a/span[@class="other"]/text()'))
3. 电影评分
# 电影评分xpath
li_info = li.xpath('.//div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()')
4.电影评论
# 电影评论xpath
li_total = li.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()')
实现代码:
导入包:
from numpy.core.defchararray import strip
from user_agent import generate_user_agent
from lxml import html
import requests
import random
import time
核心函数:
def get_info(self, url):
    """Download one listing page and print one record per movie.

    Args:
        url: Listing-page URL with the ``start`` offset already filled in.

    Every printed dict has the keys ``name``, ``others``, ``star`` and
    ``total``. A field whose XPath matches nothing is stored as ``None``
    so the record keeps a fixed shape for later persistence.
    """
    rep = requests.get(
        url=url,
        headers={"User-Agent": generate_user_agent()},
        timeout=10,  # without a timeout a stalled connection blocks forever
    ).text
    # Parse the HTML text so it can be queried with XPath.
    p = html.etree.HTML(rep)
    # One <div class="info"> node per movie on the page.
    li_lists = p.xpath("//div[@class='info']")
    for li in li_lists:
        # Fresh dict per movie so records never alias each other.
        items = {}

        # Movie title.
        li_name = li.xpath('.//div[@class="hd"]/a/span[@class="title"]/text()')
        items["name"] = li_name[0] if li_name else None

        # Alternative titles. str.strip is applied to the first match only;
        # numpy's array-wise strip is not meant for a possibly-empty list.
        li_othername = li.xpath('.//div[@class="hd"]/a/span[@class="other"]/text()')
        items["others"] = li_othername[0].strip() if li_othername else None

        # Rating.
        li_info = li.xpath(
            './/div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()'
        )
        items["star"] = li_info[0] if li_info else None

        # One-line quote shown under the movie.
        li_total = li.xpath('.//div[@class="bd"]/p[@class="quote"]/span/text()')
        items["total"] = li_total[0] if li_total else None

        print(items)
设置入口函数:
def run(self):
    """Crawl the ten listing pages in order.

    The page offsets are 0, 25, ..., 225; each formatted URL is passed
    to ``get_info``.
    """
    for page in range(10):
        # Each page lists 25 movies, so the offset advances in steps of 25.
        offset = page * 25
        self.get_info(self.url.format(offset))
        # Pause 0 or 1 second, chosen at random, to look less like a bot.
        pause = random.randint(0, 1)
        time.sleep(pause)
运行函数:
# Start the crawl only when this file is executed as a script.
if __name__ == "__main__":
    DouBanSpider().run()
整体代码:
# NOTE: numpy's strip is not used below (str.strip is used instead); the
# import is kept so the listing matches the import list shown earlier.
from numpy.core.defchararray import strip
from user_agent import generate_user_agent
from lxml import html
import requests
import random
import time


class DouBanSpider(object):
    """Scrape the Douban Top-250 movie list and print one record per movie."""

    def __init__(self):
        # "{}" is replaced with the start offset of each listing page.
        self.url = "https://movie.douban.com/top250?start={}&filter="

    @staticmethod
    def _first_text(node, path):
        """Return the first result of XPath *path* under *node*, or None."""
        matches = node.xpath(path)
        return matches[0] if matches else None

    def get_info(self, url):
        """Download one listing page and print one record per movie.

        Args:
            url: Listing-page URL with the ``start`` offset already filled in.

        Every printed dict has the keys ``name``, ``others``, ``star`` and
        ``total``. A field whose XPath matches nothing is stored as ``None``
        so the record keeps a fixed shape for later persistence.
        """
        rep = requests.get(
            url=url,
            headers={"User-Agent": generate_user_agent()},
            timeout=10,  # without a timeout a stalled connection blocks forever
        ).text
        # Parse the HTML text so it can be queried with XPath.
        p = html.etree.HTML(rep)
        # One <div class="info"> node per movie on the page.
        li_lists = p.xpath("//div[@class='info']")
        for li in li_lists:
            # Alternative titles carry surrounding whitespace; strip the first
            # match with str.strip rather than numpy's array-wise strip, which
            # is not meant for a possibly-empty list.
            other = self._first_text(
                li, './/div[@class="hd"]/a/span[@class="other"]/text()'
            )
            # Fresh dict per movie so records never alias each other.
            items = {
                "name": self._first_text(
                    li, './/div[@class="hd"]/a/span[@class="title"]/text()'
                ),
                "others": other.strip() if other is not None else None,
                "star": self._first_text(
                    li,
                    './/div[@class="bd"]/div[@class="star"]'
                    '/span[@class="rating_num"]/text()',
                ),
                "total": self._first_text(
                    li, './/div[@class="bd"]/p[@class="quote"]/span/text()'
                ),
            }
            print(items)

    def run(self):
        """Crawl the ten listing pages (offsets 0, 25, ..., 225) in order."""
        for i in range(0, 10):
            # Embed the page offset in the URL.
            url = self.url.format(i * 25)
            self.get_info(url)
            # Pause 0 or 1 second, chosen at random, to look less like a bot.
            time.sleep(random.randint(0, 1))


if __name__ == "__main__":
    spider = DouBanSpider()
    spider.run()
实现效果: