用宽度优先搜索对豆瓣【2016年国内公映电影排期】
https://www.douban.com/doulist/3516235/?start=0&sort=seq&sub_type=
进行爬取,总体没太大难度,需要注意的是对重复链接的去重
以下是第一版程序:
# -*- coding: utf-8 -*-
"""
Created on Tue May 29 10:38:47 2018
@author: phl
"""
import basicSpider
import re
from bs4 import BeautifulSoup
#取得页面源码信息
def get_html(url):
    """Download and return the HTML source of *url* via basicSpider.

    Returns whatever ``basicSpider.downloadHtml`` returns (the page source,
    or its failure value).
    """
    # Bug fix: the original wrote {("User-Agent", ...)} — a *set* containing
    # one tuple. HTTP headers must be a dict of name -> value.
    headers = {"User-Agent": 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0'}
    # NOTE(review): proxy is defined but never passed to downloadHtml —
    # presumably it should be forwarded once basicSpider supports it; confirm.
    proxy = {"http": "139.199.230.242:1080"}
    html = basicSpider.downloadHtml(url, headers=headers)
    return html
#取得电影div元素列表
def get_movie_all(html):
    """Parse the listing page and return every movie entry <div>.

    Each entry is a <div class="bd doulist-subject"> Tag, one per film.
    """
    page = BeautifulSoup(html, "html.parser")
    return page.find_all('div', class_='bd doulist-subject')
#对每个电影div进行解析
def get_movie_one(movie):
    """Extract title, rating and abstract from one movie <div>.

    Accepts a bs4 Tag (or its string form) and returns a single
    '||'-separated record string, e.g. 'Title|| 评分:8.5||导演...||...'.
    """
    # Collect fragments in a list and join once at the end instead of
    # repeated string += (which is quadratic in the worst case).
    parts = []
    # Re-parse defensively so a Tag or a raw HTML string both work.
    soup = BeautifulSoup(str(movie), "html.parser")

    # Movie title: concatenate all whitespace-stripped text fragments.
    # (The original indexed title[0] unguarded and could raise IndexError.)
    title = soup.find('div', class_='title')
    if title is not None:
        for fragment in title.stripped_strings:
            parts.append(fragment)

    # Rating: some entries carry no score. The original caught a broad
    # Exception; testing for the missing node explicitly is clearer and
    # doesn't mask unrelated errors. (Variable was misspelled 'sorce'.)
    score = soup.find('span', class_='rating_nums')
    if score is not None:
        for fragment in score.stripped_strings:
            parts.append("|| 评分:")
            parts.append(fragment)
    else:
        # NOTE(review): fabricating a neutral 5.0 keeps the record format
        # uniform but mislabels unrated films — confirm this is intended.
        parts.append("|| 评分: 5.0")

    # Abstract / summary block (director, cast, year, ...).
    abstract = soup.find('div', class_='abstract')
    if abstract is not None:
        for fragment in abstract.stripped_strings:
            parts.append("||")
            parts.append(fragment)

    return "".join(parts)
# 保存到doubanMovie.txt中
def save_file(movieInfo):
    """Append one movie record as a line to doubanMovie.txt.

    Raises UnicodeEncodeError (after printing the offending record) if the
    text cannot be encoded.
    """
    with open('doubanMovie.txt', "a", encoding='utf-8') as f:
        try:
            f.write(movieInfo + '\n')
        except UnicodeEncodeError:
            # Dump the offending record for diagnosis, then propagate.
            print(movieInfo)
            # Bug fix: bare `raise` re-raises with the original traceback;
            # the original `raise(e)` re-raised from this frame instead.
            raise
# 对豆瓣某个页面进行爬取
def CrawlMovieInfo(url):
global crawl_queue
global crawled_queue
html = get_html(url)
pat