闲得无聊去爬了一下豆瓣的Top250电影信息
代码如下,可以直接运行(需要先安装 requests 和 lxml):
import requests
from lxml import etree
from bs4 import BeautifulSoup
# Crawl the Douban "Top 250" movie list page by page and print, for every
# movie that has a second title span, its titles, alternative names, rating
# and the first line of its director/cast paragraph.

PAGE_SIZE = 25        # the listing is paged in steps of 25 via the `start` parameter
TOTAL_MOVIES = 250    # offsets 0, 25, ..., 225 cover the whole list
REQUEST_TIMEOUT = 10  # seconds; avoids hanging forever on a stalled connection

s = requests.Session()
# Stop before `start=250`: that offset is past the last movie and would only
# cost an extra request for an empty page.
for start in range(0, TOTAL_MOVIES, PAGE_SIZE):
    url = 'https://movie.douban.com/top250/?start=' + str(start) + '&filter='
    r = s.get(url, timeout=REQUEST_TIMEOUT)
    # Fail loudly on an HTTP error instead of silently parsing an error page
    # that contains no movie items.
    r.raise_for_status()

    # Parse the raw bytes as an HTML document. Only `r.content` is used, so
    # there is no need to set `r.encoding` (it only affects `r.text`).
    root = etree.HTML(r.content)
    # Each movie entry lives in <ol><li><div class="item">.
    items = root.xpath('//ol/li/div[@class="item"]')

    for item in items:
        title = item.xpath('./div[@class="info"]//a/span[@class="title"]/text()')
        # Drop characters that cannot be represented in gb2312
        # (presumably so the text prints on a GBK console; confirm).
        name = title[0].encode('gb2312', 'ignore').decode('gb2312')

        # Entries with a single title span have no second title; they are
        # skipped, as in the original script.
        if len(title) < 2:
            continue
        english = title[1]

        other = item.xpath('./div[@class="info"]//a/span[@class="other"]/text()')
        author = other[0].encode('gb2312', 'ignore').decode('gb2312')
        rating = item.xpath('.//div[@class="bd"]//span[@class="rating_num"]/text()')[0]
        # First text node of the description paragraph, with the leading
        # whitespace/newlines of the HTML layout removed.
        direct = item.xpath('./div[@class="info"]/div[2]/p/text()')[0]
        dire = str(direct).lstrip()

        print(name,english,author,rating)
        print(dire)
        print('\n')