Scrape the Douban Top 250 movie chart with Python in 5 minutes

Workflow:

Fetch the pages with requests
Parse the HTML with BeautifulSoup
Write the data out to Excel with pandas
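Before running the script, install the third-party packages (a minimal sketch, assuming pip; openpyxl backs pandas' to_excel, and pymysql is only needed for the optional MySQL step at the end):

pip install requests beautifulsoup4 pandas openpyxl sqlalchemy pymysql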

import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import os
from sqlalchemy import create_engine


def parse_single_url(html):

    '''Parse one page of HTML; each page yields 25 records.'''
    soup = BeautifulSoup(html, 'html.parser')

    items = (
        soup.find('div', class_='article')
            .find('ol', class_='grid_view')
            .find_all('div', class_='item')
    )

    # Must be created outside the loop, otherwise it is reset on every pass
    data1 = []

    for item in items:

        # Extract the fields we need
        rank1 = item.find('div', class_='pic').find('em').get_text()
        info1 = item.find('div', class_='info')
        url1 = info1.find('div', class_='hd').find('a').get('href')
        title1 = info1.find('div', class_='hd').find('span', class_='title').get_text()

        stars1 = (
            info1.find('div', class_='bd')
                .find('div', class_='star')
                .find_all('span')
        )
        rating_star1 = stars1[0].get('class')[0]  # class attribute value, e.g. 'rating5-t'
        rating_num1 = stars1[1].get_text()
        comments1 = stars1[3].get_text()
        
        If_None = info1.find('div', class_='bd').find('p', class_='quote')

        # Some entries have no quote; this check avoids
        # AttributeError: 'NoneType' object has no attribute 'find'
        if If_None is not None:
            quote1 = If_None.find('span', class_='inq').get_text()  # extract the text
        else:
            quote1 = 'missing'
            
        # Store each record as a dict and append it to data1.
        # If data1 = [] sat inside the loop, it would be reinitialized on every pass.
        # Uncomment the next line to print each record when debugging:
        # print(tuple([rank1, title1, rating_star1, rating_num1, quote1, comments1, url1]))
        
        data1.append({
            'rank': int(rank1),
            'title': title1,
            'rating_star': int(rating_star1.replace('rating', '').replace('-t', '')),  # 'rating5-t' -> 5
            'rating_num': float(rating_num1),  # np.float was removed in NumPy 1.24; use the builtin
            'quote': quote1.replace('。', ''),
            'comment': int(comments1.replace('人评价', '')),
            'url': url1
        })
        
    # Note the indentation: return belongs outside the for loop
    return data1

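To sanity-check the parser before crawling all ten pages, you can fetch just the first page (a quick sketch, assuming the page structure is unchanged; the URL and User-Agent match the crawler below):

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
r = requests.get('https://movie.douban.com/top250?start=0&filter=', headers=headers)
sample = parse_single_url(r.text)
print(len(sample))  # expect 25 records per page
print(sample[0])    # the entry for rank 1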

def download_all_htmls():
    '''Download all 10 pages of the chart (start = 0, 25, ..., 225).'''
    htmls = []
    for idx in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={idx}&filter='
        # print('craw_url:', url)
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/103.0.0.0 Safari/537.36'}
        r = requests.get(url, headers=headers)
        if r.status_code != 200:
            raise Exception(f'request failed: {url} ({r.status_code})')
        # Append the HTML text of each page to the list
        htmls.append(r.text)
        time.sleep(0.5)  # pause between requests; sleeping in the parse loop does nothing for rate limiting
    return htmls
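A quick check that the crawl covered the whole chart:

htmls = download_all_htmls()
print(len(htmls))  # expect 10 pages x 25 movies = 250 records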

def data_to_Mysql(data, db_tname):

    # The bare 'mysql://' dialect needs the mysqlclient driver; with pymysql installed, use 'mysql+pymysql://'
    engine = create_engine('mysql://test:test$@118.123.201.131:3306/test?charset=utf8')
    records = data.to_sql(db_tname, engine, index=False, if_exists='replace')

    # to_sql returns the number of rows written in pandas >= 1.4 (None in older versions)
    print(f'{db_tname}: loaded {records} rows into MySQL')
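To verify the load, you can read the row count back through the same connection string (a minimal sketch, assuming the pymysql driver and the table name used in the main block below):

import pandas as pd
from sqlalchemy import create_engine

engine = create_engine('mysql+pymysql://test:test$@118.123.201.131:3306/test?charset=utf8')
print(pd.read_sql('SELECT COUNT(*) AS n FROM movie_top_250', engine))  # expect n = 250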

if __name__ == '__main__':
    htmls = download_all_htmls()

    all_data = []
    for html in htmls:
        all_data.extend(parse_single_url(html))
    df = pd.DataFrame(all_data)

    # Save the data to the desktop
    os.chdir(r'C:\Users\DELL\Desktop')
    df.to_excel('./top250.xlsx', index=False)

    # Save to the database
    data_to_Mysql(df, 'movie_top_250')
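After the run, you can spot-check the exported file (assuming the same desktop path as above):

df_check = pd.read_excel(r'C:\Users\DELL\Desktop\top250.xlsx')
print(df_check.shape)  # expect (250, 7): 250 movies, 7 fields per record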
