Scraping the Douban Movie Top 250 with a crawler

I recently dug into web crawling and saved the scraped data to both Excel and a SQLite database; the code below is provided for reference.

The code is as follows:


from bs4 import BeautifulSoup
import urllib.request,urllib.error
import sqlite3
import re
import openpyxl


def main():

    # 1. Crawl the pages
    baseurl = "https://movie.douban.com/top250?start="
    datalist = getData(baseurl)

    # 2. Save the data, once to Excel and once to a SQLite DB
    savepath = ".\\豆瓣电影Top250.xlsx"
    saveData(datalist, savepath)
    saveDb(datalist)
    print("Crawling finished")

# Movie detail-page link
findLink = re.compile(r'<a href="(.*?)">')                                    # compiled regex object describing the pattern
# Poster image URL (lazy match so attribute order on the page does not matter)
findImg = re.compile(r'<img.*?src="(.*?)"', re.S)
# Movie title
findTitle = re.compile(r'<span class="title">(.*)</span>')
# Movie rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*)</span>')
# Number of ratings ("人评价" is the literal text on the Douban page)
findComment = re.compile(r'<span>(.*)人评价</span>')
# One-line synopsis
findInq = re.compile(r'<span class="inq">(.*)</span>')
# Other movie details (director, cast, year, ...)
findInfo = re.compile(r'<p class="">(.*?)<br/>', re.S)   # re.S makes . match newlines as well
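
# A quick sketch of how these patterns are applied (the snippet below is an
# illustrative sample, not the live page markup):
#   sample = '<span class="rating_num" property="v:average">9.7</span>'
#   re.findall(findRating, sample)   # -> ['9.7']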


# Crawl the Douban site starting from the given base URL
def getData(baseurl):
    data_all = []
    for page in range(0, 10):
        # Build the URL for each of the 10 pages (25 movies per page)
        url = baseurl + str(page * 25)
        html = askURL(url)
        # 2. Parse each page in turn
        soup = BeautifulSoup(html, "html.parser")
        for item in soup.find_all('div', class_="item"):
            data = []  # holds one movie's fields
            item = str(item)
            link = re.findall(findLink, item)[0]           # apply the regex to the item's HTML
            data.append(link)
            img = re.findall(findImg, item)[0]
            data.append(img)
            title = re.findall(findTitle, item)[0]
            data.append(title)
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            comment = re.findall(findComment, item)[0]
            data.append(comment)
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")              # drop the trailing Chinese full stop
                data.append(inq.strip())
            else:
                data.append(" ")
            info = re.findall(findInfo, item)[0]
            info = re.sub(r"<br(\s+)/>(\s+?)", " ", info)  # raw string avoids invalid-escape warnings
            info = info.replace("<br/>", "")               # str.replace returns a new string; keep it
            data.append(info.strip())
            data_all.append(data)

    return data_all
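
# Each entry of data_all holds one movie's fields, in the same order as the
# Excel header written later: link, poster URL, title, rating, rating count,
# one-line synopsis, other info.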


# Request the given URL while masquerading as a browser
def askURL(url):
    # Spoof a browser User-Agent so the site serves the page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36"
        # The User-Agent header tells the server what kind of client is calling
    }
    req = urllib.request.Request(url=url, headers=headers)
    html = ""   # fall-back value so the return below never hits an unbound name
    try:
        # Open the requested page
        response = urllib.request.urlopen(req)
        # Read the HTML content
        html = response.read().decode('utf-8')
    except urllib.error.URLError as e:
        # hasattr takes the object and the attribute name as separate arguments
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
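
# A more defensive variant (sketch): urlopen accepts a timeout in seconds,
# so one stalled request does not hang the whole crawl:
#   response = urllib.request.urlopen(req, timeout=10)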


# Save the crawl results to Excel
def saveData(datalist, savepath):
    print('saving excel...')
    workbook = openpyxl.Workbook()                      # create the workbook
    worksheet = workbook.active                         # reuse the default sheet instead of leaving it empty
    worksheet.title = "sheet1"
    # Write the header row
    title = ['Link', 'Poster', 'Title', 'Rating', 'Rating count', 'Synopsis', 'Other info']
    for i in range(0, 7):
        worksheet.cell(1, i + 1, title[i])
    # Write the data rows
    for row in range(0, len(datalist)):
        for col in range(0, len(datalist[row])):
            worksheet.cell(row + 2, col + 1, datalist[row][col])
    workbook.save(savepath)
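
# Note: openpyxl's worksheet.append(row) adds one row after the last used row,
# so the two loops above could also be written as:
#   worksheet.append(title)
#   for row_data in datalist:
#       worksheet.append(row_data)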

# Save the crawl results to SQLite
def saveDb(datalist):
    print('saving db...')

    # Open the connection
    conn = sqlite3.connect('DoubanMovie.db')
    # Create the table if it does not exist yet
    sql_create = '''
        CREATE TABLE IF NOT EXISTS movie_top250(
        id integer primary key,
        link text,
        pic text,
        name text,
        rate real,
        rate_account real,
        introduction text,
        info text
        )
    '''

    cursor = conn.cursor()
    cursor.execute(sql_create)

    # Insert the movie rows; a parameterized query sidesteps the quoting
    # problems (and SQL injection risk) of building SQL with str.format
    sql_insert = """INSERT INTO movie_top250
        (link, pic, name, rate, rate_account, introduction, info)
        VALUES (?, ?, ?, ?, ?, ?, ?)"""
    try:
        for item in datalist:
            cursor.execute(sql_insert, item)
    except Exception as e:
        print('insert failed:', e)
    finally:
        # Commit the transaction
        conn.commit()
        # Close the connection
        conn.close()
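
# An alternative: cursor.executemany(sql_insert, datalist) performs all the
# inserts in a single call instead of the explicit loop above.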


if __name__ == '__main__':
    main()
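
To spot-check the saved results, the short snippet below reads a few rows back. It is just a sketch; it assumes the script above has already run, so DoubanMovie.db and 豆瓣电影Top250.xlsx exist in the working directory.

import sqlite3
import openpyxl

# Query the first three movies back out of SQLite
conn = sqlite3.connect('DoubanMovie.db')
cursor = conn.cursor()
cursor.execute("SELECT name, rate FROM movie_top250 ORDER BY id LIMIT 3")
for name, rate in cursor.fetchall():
    print(name, rate)
conn.close()

# Read the header row back from the Excel file
workbook = openpyxl.load_workbook('豆瓣电影Top250.xlsx')
worksheet = workbook['sheet1']
print([cell.value for cell in worksheet[1]])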

Excel output: (screenshot omitted)

SQLite output: (screenshot omitted)
