Scraping Douban Data with Python urllib and BeautifulSoup

Bilibili tutorial: fetching web page data with urllib

https://www.bilibili.com/video/BV12E411A7ZQ?p=18

1. GET request

import urllib.request  # specify a URL and fetch the page data
import urllib.parse  # parser for encoding request parameters
response = urllib.request.urlopen("http://www.baidu.com")
print(response.read().decode('utf-8'))  # decode the fetched page source as UTF-8
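urllib.parse also helps with plain GET requests when query parameters are involved; a minimal sketch (the "wd" parameter name is made up purely for illustration):

params = urllib.parse.urlencode({"wd": "python"})  # -> "wd=python"
response = urllib.request.urlopen("http://httpbin.org/get?" + params)
print(response.read().decode('utf-8'))  # httpbin echoes the query arguments back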

2. POST request

# POST request; http://httpbin.org/post is a URL made for testing, and POST can simulate a real user login
try:
    data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding='utf-8')  # encode the form data into a binary payload
    response = urllib.request.urlopen("http://httpbin.org/post", data=data, timeout=1)
    print(response.read().decode('utf-8'))
except Exception as e:
    if hasattr(e, "code"):  # if the exception has a code attribute, print the status code
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
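The hasattr checks above work, but urllib also provides dedicated exception classes; a sketch of the same handling written with urllib.error (order matters, since HTTPError is a subclass of URLError):

import urllib.error

try:
    response = urllib.request.urlopen("http://httpbin.org/post", data=data, timeout=1)
    print(response.read().decode('utf-8'))
except urllib.error.HTTPError as e:  # the server answered with an error status code
    print(e.code, e.reason)
except urllib.error.URLError as e:  # network problem, DNS failure, timeout, etc.
    print(e.reason)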

3. Reading specific response information

response = urllib.request.urlopen("http://httpbin.org/get")
print(response.getheaders())  # all response headers as a list of (name, value) pairs
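Besides getheaders(), the response object exposes the status code, single headers, and the body; a short sketch:

response = urllib.request.urlopen("http://httpbin.org/get")
print(response.status)  # e.g. 200
print(response.getheader("Content-Type"))  # one header by name
print(response.read().decode('utf-8'))  # the body; httpbin returns JSON describing the request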

4. POST with custom headers

# send the request with custom headers, wrapped in a Request object
url = "https://www.douban.com"
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding='utf-8')
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
# The User-Agent essentially tells the server what kind of client is making the request and what content it can handle; in effect the script disguises itself as a browser
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))
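The same Request wrapper works for ordinary page views: leaving out data and method sends a GET with the custom User-Agent, which is usually what a scraper needs (a sketch reusing the headers defined above):

req = urllib.request.Request(url="https://www.douban.com", headers=headers)  # no data/method, so this is a GET
response = urllib.request.urlopen(req)
print(response.read().decode('utf-8'))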

Bilibili tutorial: parsing the fetched page with BeautifulSoup

https://www.bilibili.com/video/BV12E411A7ZQ?p=20

1. Creating the document object and choosing a parser

import re  # regular expressions
from bs4 import BeautifulSoup  # parse the page and extract data

# BeautifulSoup turns a complex HTML document into a tree structure; every node is a Python object,
# and all objects fall into four types (see the type-check sketch below):
# - Tag
# - NavigableString
# - BeautifulSoup
# - Comment

file = open("./baidu.html", "rb")  # "./" is the current directory; "rb" reads the file in binary mode
html = file.read()
bs = BeautifulSoup(html, "html.parser")  # use the built-in html.parser
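A quick way to see the four object types is to parse a tiny snippet and print the type of each node (the snippet below is made up purely for illustration):

from bs4 import BeautifulSoup

demo = BeautifulSoup('<p class="demo"><!--a comment--></p><b>text</b>', "html.parser")
print(type(demo))  # <class 'bs4.BeautifulSoup'>
print(type(demo.b))  # <class 'bs4.element.Tag'>
print(type(demo.b.string))  # <class 'bs4.element.NavigableString'>
print(type(demo.p.string))  # <class 'bs4.element.Comment'>; a comment also comes back via .string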

2. Getting tags, tag contents, and tag attributes

# Tag: only the first matching tag found is returned
print(bs.title)
print(bs.head)
# the text inside the tag
print(bs.title.string)
# the tag's attributes as a dict
print(bs.a.attrs)

3. Traversing the document

# traversing the document
print(bs.head.contents)  # the direct children of <head> as a list
print(bs.head.contents[1])  # an individual child taken by index
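.contents returns a list of direct children; related attributes such as .children (an iterator) and .parent cover the other directions. A short sketch:

for child in bs.head.children:  # iterate over the direct children of <head>
    print(child)
print(bs.title.parent.name)  # the tag containing <title>, normally "head"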

4. Searching the document

# searching the document
t_list = bs.find_all("a", limit=3)  # string filter: matches the tag name exactly; limit=3 caps the number of results
t_list = bs.find_all(re.compile("a"))  # regular expression: finds every tag whose name contains "a"


# search with a function: keep only the tags for which the function returns True
def name_is_exists(tag):
    return tag.has_attr("name")


t_list = bs.find_all(name_is_exists)
for item in t_list:
    print(item)

# search by keyword argument
t_list = bs.find_all(id="head")
t_list = bs.find_all(class_=True)  # every tag that has a class attribute

# search by text
t_list = bs.find_all(text=["hao123", "地图", "贴吧"])
t_list = bs.find_all(text=re.compile(r"\d"))  # regular expression matching any digit

# CSS selectors
t_list = bs.select('title')  # by tag name
t_list = bs.select(".mnav")  # by class name; the leading dot marks a class
t_list = bs.select("#u1")  # by id; the leading # marks an id
t_list = bs.select("a[class='bri']")  # by attribute value
t_list = bs.select("head > title")  # by child relationship, descending one level at a time

t_list = bs.select(".mnav ~ .bri")  # sibling selector
print(t_list[0].get_text())

Complete crawler code

https://www.bilibili.com/video/BV12E411A7ZQ?p=24

import re
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import xlwt
import sqlite3

url = "http://movie.douban.com/top250?start="
findLink = re.compile(r'<a href="(.*?)">')  # build the regex rules: movie detail link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # poster image URL; re.S lets "." match newlines
findTitle = re.compile(r'<span class="title">(.*?)</span>')  # movie title
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating
findJudge = re.compile(r'<span>(\d*)人评价</span>')  # number of ratings
findIng = re.compile(r'<span class="inq">(.*)</span>')  # one-line summary
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)  # related details
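Each compiled pattern extracts one field from a movie's HTML block. For example, applying findLink to a made-up fragment:

sample = '<a href="https://movie.douban.com/subject/1292052/">'  # fragment invented for illustration
print(re.findall(findLink, sample))  # ['https://movie.douban.com/subject/1292052/']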


def main():
    datalist = getData(url)
    savepath = "douban.xls"
    dbpath = "movie.db"
    # saveData(datalist, savepath)  # save to Excel
    saveData2DB(datalist, dbpath)  # save to the SQLite database


def getData(url):
    datalist = []
    for i in range(0, 10):  # 10 pages, 25 movies per page
        nexturl = url + str(i * 25)
        html = askURL(nexturl)

        soup = BeautifulSoup(html, "html.parser")
        # find every block matching the requirement and collect the results into a list
        for item in soup.find_all('div', class_="item"):  # <div> tags whose class is "item"
            data = []
            item = str(item)  # convert the match to a string so the regexes can run on it

            link = re.findall(findLink, item)[0]  # the pattern matches twice; only the first link is needed
            data.append(link)
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            titles = re.findall(findTitle, item)
            if len(titles) == 2:  # the movie has both a Chinese and a foreign title
                ctitle = titles[0]
                data.append(ctitle)
                otitle = titles[1].replace("/", "")
                data.append(otitle)
            else:
                data.append(titles[0])
                data.append(' ')  # leave the foreign title blank if there is none

            rating = re.findall(findRating, item)[0]
            data.append(rating)
            judge = re.findall(findJudge, item)[0]
            data.append(judge)

            ing = re.findall(findIng, item)
            if len(ing) != 0:
                ing = ing[0].replace("。", "")
                data.append(ing)
            else:
                data.append(" ")

            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # strip <br/> tags
            bd = re.sub('/', " ", bd)  # replace "/" separators
            data.append(bd.strip())

            datalist.append(data)

    return datalist


def askURL(url):
    html = ""
    headers = {
        "User-Agent": "Mozilla / 5.0(Windows NT 10.0;Win64;x64) AppleWebKit / 537.36 (KHTML, like Gecko) Chrome / 91.0.4472.114 Safari / 537.36 Edg / 91.0.864.59"
    }
    req = urllib.request.Request(url=url, headers=headers, method="POST")
    try:

        response = urllib.request.urlopen(req)
        html = response.read().decode('utf-8')

    except Exception as e:
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)

    return html


def saveData(datalist, savepath):
    print("开始!")
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)  # style_compression=0不压缩
    sheet = book.add_sheet('douban250', cell_overwrite_ok=True)  # cell_overwrite_ok=True可以覆盖单元格,默认为False
    col = ("电影链接", "图片链接", "中文名", "外文名", "评分", "评价数", "概述", "相关信息")
    for i in range(0, 8):
        sheet.write(0, i, col[i])
    for i in range(0, 250):
        data = datalist[i]
        for j in range(0, 8):
            sheet.write(i + 1, j, data[j])
    print("结束!")
    book.save(savepath)


def saveData2DB(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()

    for data in datalist:
        for index in range(len(data)):
            if index == 4 or index == 5:  # score and vote count are numeric, so they are not quoted
                continue
            data[index] = '"' + str(data[index]) + '"'  # wrap text fields in double quotes for the SQL string
        sql = '''
                insert into movie250 (
                info_link,pic_link,cname,oname,score,rated,introduction,info)
                values(%s)''' % ",".join(data)
        # print(sql)
        cur.execute(sql)
        conn.commit()
    cur.close()
    conn.close()
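Building the SQL statement by quoting and joining strings works, but it breaks as soon as a field itself contains a double quote. A sketch of the same insert using sqlite3 parameter placeholders, which removes the quoting loop entirely (the function name saveData2DB_params is new, and each entry in datalist is assumed to be a list of eight plain strings):

def saveData2DB_params(datalist, dbpath):
    init_db(dbpath)
    conn = sqlite3.connect(dbpath)
    cur = conn.cursor()
    sql = '''insert into movie250
             (info_link, pic_link, cname, oname, score, rated, introduction, info)
             values (?, ?, ?, ?, ?, ?, ?, ?)'''
    for data in datalist:
        cur.execute(sql, data)  # sqlite3 handles the quoting and escaping
    conn.commit()
    cur.close()
    conn.close()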


def init_db(dbpath):  # create the database table
    sql = '''
        create table movie250(

        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        oname varchar,
        score numeric,
        rated numeric,
        introduction text,
        info text
        )
    '''
    conn = sqlite3.connect(dbpath)  # opens the database file if it exists, otherwise creates it
    cursor = conn.cursor()  # get a cursor
    cursor.execute(sql)  # run the CREATE TABLE statement
    conn.commit()  # commit the change; queries do not need a commit
    conn.close()  # close the database connection
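Running the script a second time raises "table movie250 already exists", because the CREATE TABLE runs again against the existing file; adding IF NOT EXISTS to the statement avoids that (only the first line of the SQL changes):

sql = '''create table if not exists movie250(
         id integer primary key autoincrement,
         info_link text, pic_link text,
         cname varchar, oname varchar,
         score numeric, rated numeric,
         introduction text, info text)'''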


if __name__ == "__main__":
    main()

Common errors

object of type 'NoneType' has no len()

Cause: the called function is missing its return statement, so it returns None.
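A minimal reproduction: if getData were written without its return statement, the caller would receive None:

def getData(url):
    datalist = []
    # ... fill datalist ...
    # the "return datalist" line is missing, so the function returns None

print(len(getData("")))  # TypeError: object of type 'NoneType' has no len()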

UnboundLocalError: local variable 'a' referenced before assignment

Cause: a local variable shadows a global variable with the same name, so the function reads the name before the local assignment has run.
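A minimal reproduction of the shadowing case: the assignment inside the function makes the name local for the whole function body, so the earlier read fails:

a = 1

def f():
    print(a)  # UnboundLocalError: "a" is local here because of the assignment below
    a = 2

f()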

Note: Douban will ban your IP if you send requests too aggressively; see the throttling sketch below.
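One simple mitigation is to pause between page requests inside the getData loop; the 2-second delay below is an arbitrary choice:

import time

for i in range(0, 10):
    nexturl = url + str(i * 25)
    html = askURL(nexturl)
    time.sleep(2)  # wait a moment between pages to reduce the chance of being blocked
    # ... parse html with BeautifulSoup as before ...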
