Scraping Douban Top 250 movie page data with BeautifulSoup
A beginner-level scraping exercise: crawl the Douban Top 250 movie pages and store the data in MongoDB. Corrections are welcome.
# author: luyabala
import re
import urllib.request

import pymongo
from bs4 import BeautifulSoup
def html_download(pagenum):
    # The URL format must be exact: each page lists 25 movies, offset via the start parameter
    url = 'https://movie.douban.com/top250?start=' + str(pagenum * 25) + '&filter='
    print(url)
    headers2 = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) '
                              'Chrome/51.0.2704.63 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers2)
    answer = urllib.request.urlopen(request)
    html_text = answer.read()
    data = html_text.decode('utf-8')
    soup = BeautifulSoup(data, 'html.parser')
    return soup
def connect_db():
    # Connect to the local MongoDB instance ("localhost" or "127.0.0.1"); 27017 is the default port
    client = pymongo.MongoClient("localhost", 27017)
    db = client['mydb']                # database to use
    collection = db['douban_film250']  # collection (table) to use
    return collection
def save_info(srchnum):
    con = connect_db()
    for i in range(srchnum):  # i < srchnum
        print(i)
        html_soup = html_download(i)
        items = html_soup.find('ol', {'class': 'grid_view'}).find_all('li')
        for item in items:
            title = item.find('div', {'class': 'hd'}).find('a').get_text(' ', strip=True)
            link = item.find('div', {'class': 'hd'}).find('a').get('href')
            # get_text(' ', strip=True) joins the text fragments with spaces and drops the
            # whitespace left around the <br/> tags
            kind = item.find('div', {'class': 'bd'}).find('p').get_text(' ', strip=True)
            # find_all() returns a ResultSet, which has no get_text(), so index into it
            span_list = item.find('div', {'class': 'star'}).find_all('span')
            score = span_list[1].get_text()
            # number of ratings; span_list[3].get_text() would work here as well
            pplnum = item.find('div', {'class': 'star'}).find(text=re.compile('评价'))
            comment = item.find('span', {'class': 'inq'})
            data = {}
            data['title'] = title
            data['link'] = link
            data['kind'] = kind
            data['score'] = score
            data['people'] = pplnum
            # Some movies have no one-line quote; without this check
            # comment.get_text() would raise "NoneType has no attribute get_text"
            if comment:
                data['comment'] = comment.get_text()
            con.insert_one(data)


save_info(10)  # 10 pages * 25 movies each = Top 250
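
To check what actually ended up in MongoDB after a run, a query along these lines can be used (a minimal sketch, assuming the same local instance and the mydb / douban_film250 names used above):

import pymongo

# Connect to the same local MongoDB instance and collection the scraper writes to
client = pymongo.MongoClient("localhost", 27017)
collection = client['mydb']['douban_film250']

print(collection.count_documents({}))   # should be around 250 after save_info(10)
for doc in collection.find().limit(5):  # peek at the first few stored documents
    print(doc['title'], doc['score'], doc.get('comment', ''))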