from bs4 import BeautifulSoup
import requests, time, pymongo
from multiprocessing import Pool
# Browser User-Agent so Douban does not reject the crawler as a bot.
user_agent = 'Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'
headers ={'User-Agent':user_agent}
proxies = {'http':'60.178.173.125'}  # optional HTTP proxy; NOTE(review): never passed to requests.get below
# --- MongoDB setup ---
client = pymongo.MongoClient('localhost', 27017) # connect to the local MongoDB server
Douban = client['douban'] # database name
url_list = Douban['url_list'] # collection: one document per movie detail-page link
item_list = Douban['item_list'] # collection: the scraped per-movie detail records
# Listing pages of the Top 250: 25 movies per page, hence 10 pages.
start_url = ['https://movie.douban.com/top250?start={}&filter='.format(str(i)) for i in range(0, 250, 25)]
url0 = 'https://movie.douban.com/top250?start=0&filter='
# Sample detail-page URLs kept around for manual testing (url_404 is a dead page).
url_one = 'https://movie.douban.com/subject/6146955/'
url_two = 'https://movie.douban.com/subject/1291546/'
url_404 = 'https://movie.douban.com/subject/5912992/'
link_error = [] # collects detail-page URLs that turned out to be dead (404)
def get_movie_info(url):
    """Scrape one Douban movie detail page and store the record in MongoDB.

    Parameters
    ----------
    url : str
        Detail-page URL, e.g. 'https://movie.douban.com/subject/6146955/'.

    Side effects: appends dead links to the module-level ``link_error`` list;
    inserts the parsed record into the ``item_list`` collection.
    """
    wb_data = requests.get(url, headers=headers, timeout=10)  # timeout so a hung request can't stall the crawl
    soup = BeautifulSoup(wb_data.text, 'lxml')
    time.sleep(4)  # throttle requests so Douban does not block the crawler
    if soup.find('title').text == "页面不存在":
        # Detail page no longer exists: remember the dead link and skip it.
        # (Scraping from the listing page instead would keep the ranking
        # complete at the cost of fewer fields.)
        link_error.append(url)
        return
    genres = soup.find_all(property="v:genre")           # genre appears once per tag -> list
    runtime = soup.find(property="v:runtime")            # may be absent on some pages
    ytime = soup.find(property="v:initialReleaseDate")   # release date; we keep only the year
    movie = {
        'number': soup.select('div.top250 > span.top250-no')[0].string,  # ranking, e.g. 'No.1'
        'title': soup.find(property="v:itemreviewed").string,
        'score': soup.find(property="v:average").string,
        'comments_count': soup.find(property="v:votes").string,
        'genre': [g.string for g in genres],
        # BUG FIX: the original guard was soup.find_all('span', 'v:runtime'),
        # a CSS-class lookup that never matches property="v:runtime" spans,
        # so 'runtime' was always None. Use the element found above instead.
        'runtime': runtime.string if runtime is not None else None,
        'year': ytime.string[0:4],
        'url': url
    }
    # Some pages lack the property="v:runtime" attribute entirely: fall back
    # to slicing the raw "info" block between the '片长' label and the
    # trailing '分' of '分钟'.
    if movie['runtime'] is None:
        info_text = soup.find_all(id='info')[0].get_text(strip=True)
        end = info_text.find("分")
        start = info_text.find("片长")
        movie['runtime'] = info_text[start + 3:end + 1]  # +3 skips '片长:' 
    print(movie)
    item_list.insert_one(movie)  # persist the record
# get_movie_info(url_one)
# Fetch the movie detail-page links from one listing page.
def get_movie_link(url):
    """Collect the movie detail-page links from one Top-250 listing page.

    Each link found is stored as ``{'url': ...}`` in the ``url_list``
    collection and echoed to stdout.
    """
    response = requests.get(url,headers=headers)
    page = BeautifulSoup(response.text, 'lxml')
    for anchor in page.select('div > div.info > div.hd > a '):
        movie_url = anchor.get('href')
        url_list.insert_one({'url':movie_url})  # remember the link for the detail crawl
        print(movie_url)
# Stage 1: crawl every listing page and push all movie links into MongoDB.
# for i in start_url:
#     get_movie_link(i)
#
# Stage 2: read the stored links back and crawl each detail page.
# for i in url_list.find(no_cursor_timeout = True): # MongoDB kills idle cursors after 10 min by default; disable that (close the cursor manually when done)
# #     print(i['url'])
#     item_url = i['url']
#     get_movie_info(item_url)
# Resume support: recompute which links have not been scraped yet.
db_urls = [item['url'] for item in url_list.find()] # every link collected in stage 1
index_urls = [item['url'] for item in item_list.find()] # links whose details are already stored
x = set(db_urls)
y = set(index_urls)
rest_of_urls = x-y # links still waiting to be crawled
# for i in rest_of_urls:
#     get_movie_info(i)
print(rest_of_urls)
print(len(link_error))
# # Ad-hoc database checks:
# # print(url_list.find()['url']) # total count
# print(url_list.count())