爬取页面:
【2016年国内公映电影排期】
#!/usr/bin/env python
# -*- coding=utf-8 -*-
import urllib.request
import re
import time
import os
from bs4 import BeautifulSoup
def get_html(url):
    """Fetch the raw content behind *url*.

    Args:
        url: Any URL that ``urllib.request.urlopen`` can open.

    Returns:
        The response body as ``bytes`` (not decoded).

    Raises:
        urllib.error.URLError: If the URL cannot be opened.
    """
    # Use the response as a context manager so the underlying connection is
    # closed even when read() raises.
    with urllib.request.urlopen(url) as response:
        return response.read()
def get_movie_all(html):
    """Return every movie entry found in *html*.

    The page is parsed with Beautiful Soup's ``html.parser`` backend and all
    ``<div>`` elements whose class is ``bd doulist-subject`` are collected.

    Args:
        html: The page markup, as accepted by ``BeautifulSoup``.

    Returns:
        The result of ``find_all``: one element per matching ``<div>``.
    """
    return BeautifulSoup(html, 'html.parser').find_all(
        'div', class_='bd doulist-subject'
    )
def get_one_movie(movie):
    """Extract the title, rating and abstract from one movie entry.

    Args:
        movie: One movie element (or its HTML text); it is serialised with
            ``str()`` and parsed with ``html.parser``.

    Returns:
        A list of exactly three strings, ``[title, rating, abstract]``:

        * title    - text of the first ``<div class="title">``, fragments
          joined with a single space;
        * rating   - ``.string`` of the second ``<span>`` in the entry;
        * abstract - text fragments of the first ``<div class="abstract">``,
          each prefixed with one space.

        A part that is missing from the markup is returned as ``''`` so the
        three positions never shift.
    """
    entry = BeautifulSoup(str(movie), 'html.parser')

    # Join the title fragments into ONE item: appending them individually
    # would push the rating/abstract out of positions 1 and 2 whenever the
    # title holds more than one text node.
    title_tag = entry.find('div', class_='title')
    if title_tag is not None:
        title = ' '.join(title_tag.stripped_strings)
    else:
        title = ''

    # The rating is read from the second <span> of the entry.
    # NOTE(review): presumably the first <span> is the star icon - confirm
    # against the page markup; selecting class 'rating_nums' may be sturdier.
    spans = entry.find_all('span')
    rating = spans[1].string if len(spans) > 1 else None

    abstract_tag = entry.find('div', class_='abstract')
    if abstract_tag is not None:
        # Keep the leading space before every fragment, as callers receive
        # the abstract in the form " line1 line2 ...".
        abstract = ''.join(' ' + piece for piece in abstract_tag.stripped_strings)
    else:
        abstract = ''

    # .string is None when the tag does not wrap exactly one text node;
    # normalise to '' so callers can concatenate safely.
    return [title, rating or '', abstract]
def save(text, file_name):
    """Append *text* to the file *file_name* as UTF-8 bytes.

    The file is opened in binary append mode, so it is created when missing,
    existing content is preserved, and no newline translation takes place.

    Args:
        text: The string to write.
        file_name: Path of the target file.
    """
    with open(file_name, 'ab') as f:
        # Encode explicitly so the on-disk encoding is visible at the call.
        f.write(text.encode('utf-8'))
if __name__ == '__main__':
    # Script entry point: download the first page of the list, extract every
    # movie entry and append one formatted line per movie to movie.txt.
    url = 'https://www.douban.com/doulist/3516235/?start=0&sort=seq&sub_type='
    html = get_html(url)
    movie_list = get_movie_all(html)
    for movie in movie_list:
        result = get_one_movie(movie)
        # Beautiful Soup's .string is None when a tag does not wrap exactly
        # one text node; fall back to '' so the concatenation below cannot
        # raise TypeError and abort the whole run.
        rating = result[1] if result[1] is not None else ''
        text = '电影名:' + result[0] + ' ' + '评分:' + rating + ' ' + result[2] + '\n'
        save(text, 'movie.txt')
目前只爬取了第一页的内容,代码参考了这位大神的实现。
毕竟小白学习都是从模仿开始的嘛~~弄懂思路之后,我又自己敲了一遍。
慢慢来吧,相信自己不是废物┭┮﹏┭┮
相关待看
豆瓣电影TOP250爬取