Scraping the Douban Top 250 movie chart with Python and saving it to a database or file

This article scrapes the Douban movie chart using the third-party requests module, the built-in re (regular expression) module, and a MySQL database accessed through pymysql.

import requests
import re
import pymysql

class DouBan:
    def __init__(self):
        self.baseurl = 'https://movie.douban.com/top250?start={}&filter='
        # Database connection parameters
        self.host = 'localhost'
        self.user = 'root'
        self.pwd = '123456'
        # Recent pymysql versions require keyword arguments here
        self.conn = pymysql.connect(host=self.host, user=self.user,
                                    password=self.pwd, charset='utf8mb4')
        self.cur = self.conn.cursor()
        # Douban rejects requests that carry no browser User-Agent
        self.headers = {'User-Agent': 'Mozilla/5.0'}

    def getHTml(self, url):
        res = requests.get(url, headers=self.headers)
        res.encoding = 'utf-8'
        html = res.text
        self.parseHtml(html)

    def parseHtml(self, html):
        # Capture groups: title, year, country/region, rating
        reg = r'<div class="info">[\s\S]*?<span class="title">([\s\S]*?)</span>[\s\S]*?<br>([\s\S]*?)&nbsp;/&nbsp;([\s\S]*?)&nbsp[\s\S]*?<span class="rating_num" property="v:average">([\s\S]*?)</span>'
        p = re.compile(reg)
        infolist = p.findall(html)
        for info in infolist:
            each_info=[]
            for x in info:
                x = x.strip()
                each_info.append(x)
            self.writeComment(each_info)

# -------------------- Save to a local text file -----------------------
    def writeComment(self, info):
        try:
            with open('豆瓣.txt', 'a', encoding='utf-8') as f:
                for each in info:
                    f.write(each + '     ')
                f.write('\n')
        except OSError as e:
            print('Failed to open the file:', e)
# ------------------ Save to a MySQL database (alternative) ------------------
    # To store the results in MySQL instead, comment out the file version above
    # and uncomment this method (a parameterized sketch also follows the listing):
    # def writeComment(self,info):
    #     self.cur.execute('create database if not exists maoyan;')
    #     self.cur.execute('use maoyan')
    #     self.cur.execute('create table if not exists maoyan(id int primary key auto_increment,moviename varchar(60),date char(100),country varchar(20),score varchar(10))default charset="utf8";')
    #     self.cur.execute('insert into maoyan(moviename,date,country,score) values("%s","%s","%s","%s");'%(info[0],info[1],info[2],info[3]))
    #     self.conn.commit()
    def main(self):
        # self.cur.execute('drop database maoyan;')
        # The chart has 10 pages of 25 movies each: start = 0, 25, ..., 225
        for i in range(10):
            url = self.baseurl.format(i * 25)
            print(url)
            self.getHTml(url)
        print('Finished writing')


if __name__=='__main__':
    douban = DouBan()
    douban.main()
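If you switch to the MySQL branch, note that building the INSERT with %-style string formatting breaks on titles that contain quotes. Below is a minimal parameterized sketch of that step; the function name write_comment_db and its (conn, info) signature are only for illustration, and it assumes the same maoyan schema and local credentials used above.

```
import pymysql

def write_comment_db(conn, info):
    """Sketch: store one (moviename, date, country, score) row using placeholders."""
    cur = conn.cursor()
    cur.execute('create database if not exists maoyan character set utf8mb4;')
    cur.execute('use maoyan;')
    cur.execute(
        'create table if not exists maoyan('
        'id int primary key auto_increment,'
        'moviename varchar(60), date char(100),'
        'country varchar(20), score varchar(10)) default charset=utf8mb4;'
    )
    # %s placeholders let pymysql quote the values instead of building SQL by hand
    cur.execute(
        'insert into maoyan(moviename, date, country, score) values(%s, %s, %s, %s);',
        (info[0], info[1], info[2], info[3])
    )
    conn.commit()

# Example usage (assumes the same local credentials as in __init__ above):
# conn = pymysql.connect(host='localhost', user='root', password='123456')
# write_comment_db(conn, ['肖申克的救赎', '1994', '美国', '9.7'])
```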

 

The following is another version of the scraper, this time parsing the page with BeautifulSoup and saving the Douban Top 250 movies into a MySQL database:

```
import requests
from bs4 import BeautifulSoup
import pymysql

# Connect to MySQL
db = pymysql.connect(host='localhost', port=3306, user='root',
                     password='your_password', db='your_database', charset='utf8mb4')
cursor = db.cursor()

# Create the table if it does not exist yet
cursor.execute('CREATE TABLE IF NOT EXISTS top250('
               'id INT PRIMARY KEY AUTO_INCREMENT,'
               'title VARCHAR(255) NOT NULL,'
               'score FLOAT(3,1) NOT NULL,'
               'director VARCHAR(255) NOT NULL,'
               'actor VARCHAR(255) NOT NULL,'
               'year VARCHAR(4) NOT NULL)')

# Crawl all 10 pages of the Top 250 list (25 movies per page)
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
for start in range(0, 250, 25):
    url = 'https://movie.douban.com/top250?start={}'.format(start)
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'html.parser')

    for item in soup.find_all('div', class_='info'):
        # Title and rating sit in their own <span> tags
        title = item.find('span', class_='title').string
        score = item.find('span', class_='rating_num').string
        # The first <p> holds "导演: ... 主演: ..." on one line and
        # "year / country / genre" on the next (separated by <br>)
        lines = [line.strip() for line in item.find('p').get_text().split('\n') if line.strip()]
        credits = lines[0].replace('\xa0', ' ')
        director = credits.split('主演:')[0].replace('导演:', '').strip()
        actor = credits.split('主演:')[1].strip() if '主演:' in credits else ''
        year = lines[1].replace('\xa0', ' ').split('/')[0].strip()[:4]  # keep the 4-digit year
        # Parameterized INSERT so quoting is handled by the driver
        cursor.execute('INSERT INTO top250(title, score, director, actor, year) '
                       'VALUES (%s, %s, %s, %s, %s)',
                       (title, score, director, actor, year))
    db.commit()

# Close the database connection
db.close()
```

This version scrapes the title, rating, director, cast, and year of every Top 250 movie and writes them into a local MySQL database. It also creates the table itself, so there is no need to build it by hand. Replace "your_password" and "your_database" in the code with your own MySQL password and database name.
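As a quick sanity check after a run, the table can be queried back with the same connection settings. This is only an illustrative snippet; it assumes the top250 table and the placeholder credentials from the example above.

```
import pymysql

# Reuses the placeholder credentials from the example above
db = pymysql.connect(host='localhost', port=3306, user='root',
                     password='your_password', db='your_database', charset='utf8mb4')
cursor = db.cursor()

cursor.execute('SELECT COUNT(*) FROM top250')
print('rows stored:', cursor.fetchone()[0])

# Show the five highest-rated entries that were saved
cursor.execute('SELECT title, score, year FROM top250 ORDER BY score DESC LIMIT 5')
for title, score, year in cursor.fetchall():
    print(title, score, year)

db.close()
```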