一:简介
通过抓取豆瓣电影Top250的数据,进行了三项统计:各上映年份上榜的电影数量(上榜数量为0的年份不作统计);各个国家和地区出品的电影数量;250部电影中各类型标签出现的数量。
二:源代码
#coding=utf-8
import requests
from bs4 import BeautifulSoup
import os, socket, re
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
class Spider:
    """Small helper for downloading and parsing Douban Top 250 pages.

    Attributes:
        url: Page the spider starts from; defaults to the Douban Top 250 list.
        header: HTTP headers sent with every request (a desktop Chrome
            User-Agent string).
    """

    # Root folder under which mkdir() creates its working directories.
    # Kept as a raw string: "\m" is not a recognised escape sequence, so the
    # non-raw spelling only worked by accident and triggers a warning on
    # current Python versions. The resulting value is unchanged.
    BASE_DIR = r"D:\mdouban"

    def __init__(self, url='https://movie.douban.com/top250'):
        """Store the start URL and the request headers.

        Args:
            url: Page to start crawling from.
        """
        self.url = url
        self.header = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36"
        }

    def mkdir(self, path):
        """Ensure ``BASE_DIR/path`` exists and make it the working directory.

        Args:
            path: Sub-directory name; surrounding whitespace is stripped.

        Returns:
            The absolute path of the new current working directory.

        Raises:
            OSError: If the directory cannot be created or entered.
        """
        target = os.path.join(self.BASE_DIR, path.strip())
        # exist_ok=True replaces the separate exists() check, so there is no
        # window between testing for the directory and creating it.
        os.makedirs(target, exist_ok=True)
        os.chdir(target)
        return os.path.abspath('.')

    def get_soup(self, link, timeout=20):
        """Fetch ``link`` and return the page parsed as a BeautifulSoup tree.

        Args:
            link: URL to download.
            timeout: Seconds to wait for the server before giving up. Without
                an explicit value the request could block indefinitely.

        Returns:
            A BeautifulSoup object built with the ``lxml`` parser.
        """
        response = requests.get(link, headers=self.header, timeout=timeout)
        # Decode with the charset detected from the body rather than the one
        # taken from the response headers.
        response.encoding = response.apparent_encoding
        return BeautifulSoup(response.text, 'lxml')
if __name__ == '__main__':
    # Script entry point: configure networking, then prepare the output
    # directory before any data is fetched.

    # Default timeout, in seconds, for socket objects created from here on.
    socket.setdefaulttimeout(20)

    # Build the crawler with its default start URL and switch into the
    # 'top250' working directory; `path` holds that directory's absolute path.
    spider = Spider()
    path = spider.mkdir('top250')

    print('starting get data from douban...')
def autolabel(rects, ax, xpos='center'): #设置显示每一个条形图的值
"""
Attach a text label above each bar in *rects*, displaying its height.