Scraping Douban Top 250 movie page data with BeautifulSoup
A beginner-level scraping exercise: crawl the Douban Top 250 movie pages and store the data in MongoDB. Corrections are welcome.
# author: luyabala
import re
import urllib.request

import pymongo
from bs4 import BeautifulSoup
def html_download(pagenum):
    # The URL format must be exact: each page lists 25 movies, offset via the start parameter
    url = 'https://movie.douban.com/top250?start=' + str(pagenum * 25) + '&filter='
    print(url)
    headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/51.0.2704.63 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers2)
    answer = urllib.request.urlopen(request)
    html_text = answer.read()
    data = html_text.decode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')
    return soup
def connect_db():
    # Connect to the local MongoDB instance ("localhost" or "127.0.0.1"); 27017 is the default port
    client = pymongo.MongoClient("localhost", 27017)
    db = client['mydb']                # database to use
    collection = db['douban_film250']  # collection (table) to use
    return collection
def save_info(srchnum):
    con = connect_db()
    for i in range(srchnum):  # i < srchnum
        print(i)
        html_soup = html_download(i)
        items = html_soup.find('ol', {'class': 'grid_view'}).find_all('li')
        for item in items:
            title = item.find('div', {'class': 'hd'}).find('a').get_text(' ', strip=True)
            link = item.find('div', {'class': 'hd'}).find('a').get('href')
            # get_text(' ', strip=True) joins the text fragments with spaces and drops the
            # whitespace left around the <br/> tags
            kind = item.find('div', {'class': 'bd'}).find('p').get_text(' ', strip=True)
            # find_all() returns a ResultSet, which has no get_text(), so index into it
            span_list = item.find('div', {'class': 'star'}).find_all('span')
            score = span_list[1].get_text()
            # number of ratings; span_list[3].get_text() would work here as well
            pplnum = item.find('div', {'class': 'star'}).find(text=re.compile('评价'))
            comment = item.find('span', {'class': 'inq'})
            data = {}
            data['title'] = title
            data['link'] = link
            data['kind'] = kind
            data['score'] = score
            data['people'] = pplnum
            # Some movies have no one-line quote; without this check
            # comment.get_text() would raise "NoneType has no attribute get_text"
            if comment:
                data['comment'] = comment.get_text()
            con.insert_one(data)


save_info(10)  # 10 pages * 25 movies each = Top 250
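
To check what actually ended up in MongoDB after a run, a query along these lines can be used (a minimal sketch, assuming the same local instance and the mydb / douban_film250 names used above):

import pymongo

# Connect to the same local MongoDB instance and collection the scraper writes to
client = pymongo.MongoClient("localhost", 27017)
collection = client['mydb']['douban_film250']

print(collection.count_documents({}))   # should be around 250 after save_info(10)
for doc in collection.find().limit(5):  # peek at the first few stored documents
    print(doc['title'], doc['score'], doc.get('comment', ''))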