import requests
import lxml.html
from bs4 import BeautifulSoup
import re
import bs4
from pymongo import MongoClient
def req(url, param, timeout=10):
    """Fetch *url* with a GET request and return the response body as text.

    Args:
        url: Address to request.
        param: Mapping sent as the URL query string.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, requests would wait indefinitely on a stalled server.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.Timeout: If the server does not respond in time.
    """
    resp = requests.get(url, params=param, timeout=timeout)
    return resp.text
def get_data(data):
    """Parse the HTML of a film listing page into a list of film records.

    The first ``<ol>`` element in the page is treated as the film list and
    every tag that is a direct child of it as one film entry.

    Args:
        data: HTML source of the page, as text.

    Returns:
        A list with one dict per film entry, with these keys:

        * ``'name'``  - two lists of text nodes: those of
          ``span.title`` and those of ``span.other``.
        * ``'info'``  - text nodes of the ``<p class="">`` element
          (director / cast information).
        * ``'star'``  - text nodes of ``span.rating_num`` (the rating).
        * ``'num'``   - text of the first attribute-less ``<span>`` in the
          entry (the rating count), or ``None`` when there is none.
        * ``'quote'`` - text nodes of ``span.inq`` (the tagline).

        An empty list is returned when the page contains no ``<ol>``.
    """
    # Locate the block that holds the content to scrape.
    source_soup = BeautifulSoup(data, 'html.parser')
    data_ol = source_soup.ol
    if data_ol is None:
        return []

    films = []
    for tag_li in data_ol:
        # Direct children also include whitespace strings; keep only tags.
        if not isinstance(tag_li, bs4.element.Tag):
            continue

        # Serialise the tag itself (not the repr of its children list) so
        # lxml receives clean markup for exactly this entry.
        item_html = str(tag_li)
        datas = lxml.html.fromstring(item_html)

        # Film names: main title(s) and alternative title(s).
        names = [
            datas.xpath('//span[@class="title"]/text()'),
            datas.xpath('//span[@class="other"]/text()'),
        ]
        # Director and leading-cast information.
        info = datas.xpath('//p[@class=""]/text()')
        # Rating, and the number of people who rated.
        star = datas.xpath('//span[@class="rating_num"]/text()')
        # Search only this entry's markup, so each film gets its own count
        # rather than the first one found in the whole list.
        num_match = re.search(r'<span>(.*?)</span>', item_html)
        num = num_match.group(1) if num_match else None
        # Famous quote / tagline of the film.
        quote = datas.xpath('//span[@class="inq"]/text()')

        # Collect everything for this film into one record.
        films.append({
            'name': names,
            'info': info,
            'star': star,
            'num': num,
            'quote': quote,
        })
    return films
# Connect to MongoDB on localhost:27017; records are stored in the
# ``films2`` collection of the ``films`` database.
cli = MongoClient('localhost', 27017)
db = cli.films

# The listing URL is the same for every page; only the query string changes.
url = 'https://movie.douban.com/top250'

# Ten pages of 25 entries each: start = 0, 25, ..., 225.
for i in range(1, 11):
    param = {
        'start': (i - 1) * 25,
        'filter': "",
    }
    films = get_data(req(url, param))
    # Collection.insert() was removed in PyMongo 4; insert_many() is the
    # replacement, but it rejects an empty list, so skip empty pages.
    if films:
        db.films2.insert_many(films)

print("spider success")
使用 bs4、lxml.html(XPath)和 requests 实现。
还请各位看客多多指教。