python爬虫框架

最新推荐文章于 2024-08-03 19:27:22 发布

小趴菜的春天

最新推荐文章于 2024-08-03 19:27:22 发布

阅读量151

点赞数

文章标签：python 爬虫 正则表达式

本文链接：https://blog.csdn.net/weixin_44168245/article/details/119175857

版权

爬取豆瓣top250的电影数据，保存至excel中，再将数据导入MySQL中

from bs4 import BeautifulSoup  # 网页解析，获取数据
import re  # 正则
import urllib.request, urllib.error  # 制定url，获取网页
import xlwt  # 进行excel操作



# Compiled extraction rules, applied to the HTML of one movie entry.
# Captures are lazy (".*?") so that each match ends at the first closing
# delimiter instead of running on to a later tag in the same text.

# b1. Movie detail link
findLink = re.compile(r'<a href="(.*?)">')
# b2. Poster image URL; re.S lets "." cross newlines inside the <img> tag,
# and the lazy prefix stops at the first src attribute rather than the last.
findimg = re.compile(r'<img.*?src="(.*?)"', re.S)
# b3. Movie title(s)
findTitle = re.compile(r'<span class="title">(.*?)</span>')
# b4. Rating score
findscore = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# b5. Number of people who rated
findnum = re.compile(r'<span>(\d*)人评价</span>')
# b6. One-line summary quote
findinq = re.compile(r'<span class="inq">(.*?)</span>')
# b7. Remaining descriptive text of the entry (may span several lines)
findcontext = re.compile(r'<p class="">(.*?)</p>', re.S)


def main():
    """Scrape the Douban Top 250 listing and write the result to an .xls file."""
    base_url = "https://movie.douban.com/top250?start="
    output_file = "豆瓣1.xls"
    # Step 1: crawl and parse; step 3: persist the parsed rows.
    saveData(getData(base_url), output_file)


# 1. Crawl the listing pages and parse every movie entry

# Raw string on purpose: "\s" inside a plain string literal is an invalid
# escape sequence and triggers a warning on current Python versions.
_BR_TAG = re.compile(r'<br(\s+)?/>(\s+)?')


def _first_match(pattern, text, default=" "):
    """Return the first match of compiled *pattern* in *text*, or *default*."""
    matches = pattern.findall(text)
    return matches[0] if matches else default


def _parse_item(item_html):
    """Build the 8-field record for one movie from its ``div.item`` HTML.

    Field order: link, image URL, first title, second title, score,
    number of ratings, summary quote, descriptive text. A field whose
    pattern does not match is stored as a placeholder instead of raising
    IndexError, so one malformed entry cannot abort the whole crawl.
    """
    record = [
        _first_match(findLink, item_html),  # b1. detail link
        _first_match(findimg, item_html),   # b2. image URL
    ]

    # b3. Titles: when exactly two are present the second one has its
    # "/" separator removed; otherwise the second column is a blank.
    titles = findTitle.findall(item_html)
    if len(titles) == 2:
        record.append(titles[0])
        record.append(titles[1].replace("/", ""))
    else:
        record.append(titles[0] if titles else " ")
        record.append(" ")

    record.append(_first_match(findscore, item_html))  # b4. score
    record.append(_first_match(findnum, item_html))    # b5. rating count

    # b6. Summary quote; full stops are replaced by spaces, and a missing
    # quote becomes a single space.
    record.append(_first_match(findinq, item_html).replace("。", " "))

    # b7. Descriptive text: turn <br/> tags and slashes into spaces,
    # then trim surrounding whitespace.
    details = _first_match(findcontext, item_html, default="")
    details = _BR_TAG.sub(" ", details)
    details = details.replace("/", " ")
    record.append(details.strip())

    return record


def getData(url1):
    """Fetch the ten listing pages and return one record per movie.

    Args:
        url1: URL prefix; the page offset (0, 25, ..., 225) is appended.

    Returns:
        A list of 8-element lists, one per ``div.item`` found, in page order.
    """
    datalist = []
    for page in range(10):
        # a1. Download the page source for this offset.
        html1 = askURL(url1 + str(page * 25))

        # 2. Locate every movie entry and parse it from its HTML text.
        soup = BeautifulSoup(html1, "html.parser")
        for item in soup.find_all('div', class_="item"):
            datalist.append(_parse_item(str(item)))

    return datalist


# a1. Fetch the HTML of a single URL
def askURL(url):
    """Download *url* and return its body decoded as UTF-8.

    Args:
        url: Address to request.

    Returns:
        The decoded page text, or an empty string when the request fails
        with ``urllib.error.URLError`` (in which case the HTTP status code
        and/or the failure reason are printed).
    """
    # Browser-like User-Agent. The value must not repeat the header name:
    # a leading "User-Agent: " would be sent as part of the value itself.
    head = {
        "User-Agent": "Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Mobile Safari/537.36"
    }
    req = urllib.request.Request(url, headers=head)
    html = ""

    try:
        # The context manager closes the response even if decoding fails.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")
    except urllib.error.URLError as e:
        if hasattr(e, "code"):
            print(e.code)  # HTTP status code, present on HTTPError
        if hasattr(e, "reason"):
            print(e.reason)  # reason for the failure

    return html


# 3. Save the data to an Excel workbook
def saveData(datalist, savepath):
    """Write the scraped records to an .xls workbook at *savepath*.

    Args:
        datalist: Sequence of records; each record is a sequence of cell
            values in the same order as the header row.
        savepath: Destination file path passed to ``workbook.save``.

    The number of rows written follows ``len(datalist)`` rather than a fixed
    250, so a partial crawl no longer fails with IndexError. Values beyond
    the number of header columns are ignored.
    """
    workbook = xlwt.Workbook(encoding="utf-8", style_compression=0)
    sheet = workbook.add_sheet('豆瓣电影top250', cell_overwrite_ok=True)
    col = ("电影链接", "图片链接", "影片中文名", "影片外国名", "评分", "评分数", "概况", "sss")

    # Header row.
    for j, header in enumerate(col):
        sheet.write(0, j, header)

    # One row per record, offset by one to leave room for the header.
    for i, data in enumerate(datalist):
        print("%d条" % i)
        for j, value in enumerate(data[:len(col)]):
            sheet.write(i + 1, j, value)

    workbook.save(savepath)


if __name__ == "__main__":  # program entry point
    # Run the scraper, then print a completion message
    main()
    print("爬取完毕")

请添加图片描述

将excel表的数据导入mysql中

from sqlalchemy import create_engine
import pymysql
import pandas as pd
# Create a SQLAlchemy engine for MySQL through the pymysql driver:
# user root, password 1234, host localhost, port 3308, database house.
# NOTE(review): the connection URL requests charset=utf8 — confirm whether utf8mb4 is intended.

engine = create_engine('mysql+pymysql://root:1234@localhost:3308/house?charset=utf8')
print(engine)

# Run `show tables` via read_sql_query to list the tables in the database
formlist = pd.read_sql_query('show tables', con=engine)
print('house数据库数据表清单为:','\n', formlist)
# Load the 豆瓣1.xls spreadsheet into a DataFrame
# (presumably the file written by the scraper — verify the path)
data = pd.read_excel('E:/pachong/pachong_new/豆瓣1.xls')
print('爬下的资料为:',data)
# Store the DataFrame with to_sql in the table named 'douban';
# called with index=False and if_exists='replace'
data.to_sql('douban', con=engine, index=False, if_exists='replace')

在这里插入图片描述

小趴菜的春天

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
python爬虫框架

爬取豆瓣top250的电影数据，保存至excel中from bs4 import BeautifulSoup # 网页解析，获取数据import re # 正则import urllib.request, urllib.error # 制定url，获取网页import xlwt # 进行excel操作import sqlite3 # sqlite数据库# b1.影片链接规则（正则表达式对象，表示规则）findLink = re.compile(r'<a href="(.*
复制链接

扫一扫