Python web scraping: beginner's study notes

Beginner notes from a Bilibili tutorial on scraping a certain movie site's Top 250 list with Python.

2021-03-21: Learned the basics of web scraping from a Bilibili tutorial and recorded them here. This exercise uses three packages: urllib, bs4, and sqlite3.

urllib sends the request to a page and retrieves the response.

import urllib.request
import urllib.parse
import urllib.error

# GET request
html = urllib.request.urlopen("http://www.baidu.com")
print(html.read().decode("utf-8"))  # read the body and decode it

# POST request
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
html = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(html.read().decode("utf-8"))  # read the body and decode it

# Timeout and exception handling
try:
    html = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
    print(html.read().decode("utf-8"))  # read the body and decode it
except urllib.error.URLError as e:
    print("timeout", e)

response = urllib.request.urlopen("http://httpbin.org/get")
print(response.status)
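
Besides the status code, the response object also exposes the response headers. A minimal sketch, reusing the httpbin URL from above (the exact headers depend on the server):

resp = urllib.request.urlopen("http://httpbin.org/get")
print(resp.status)                     # e.g. 200
print(resp.getheader("Content-Type"))  # a single header, e.g. application/json
for name, value in resp.getheaders():  # all headers as (name, value) pairs
    print(name, value)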

# Many sites try to block crawlers by checking the request headers,
# so we send a browser-like User-Agent along with the request.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
}
url = "http://httpbin.org/post"
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
resp = urllib.request.urlopen(req).read().decode("utf-8")
print(resp)

# Access the movie site
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
}
url = "http://www.*ban.com"  # fill in the real site yourself
req = urllib.request.Request(url=url, headers=headers, method="GET")
resp = urllib.request.urlopen(req).read().decode("utf-8")
print(resp)

bs4 is responsible for parsing the elements of the page.

from bs4 import BeautifulSoup
import re

# Read the saved page into memory
file = open("./baidu.html", "rb")
html = file.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")

# bs.<tag name> gives the first matching tag and everything inside it
print(bs.title)
print(bs.a)
print(bs.head)

# 1. Tag: a tag and its contents; returns the first match it finds
print(type(bs.head))  # Tag
# 2. NavigableString: the string inside a tag
print(type(bs.title.string))  # NavigableString
print(type(bs.a.attrs))       # the tag's attributes as a dict
# 3. BeautifulSoup: the whole document object
print(type(bs))
print(bs)
print(bs.a.string)
# 4. Comment: a comment node (the string comes back without the comment markers)
print(type(bs.a.string))


# Traversing the document
print(bs.head.contents)
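
contents returns a list of direct children; a few related traversal attributes, shown here as a minimal sketch on the same bs object:

for child in bs.head.children:   # generator over the direct children
    print(child)
print(bs.title.parent.name)      # the parent tag of <title>, normally "head"
print(bs.title.next_sibling)     # the node that follows <title> inside <head>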

# Searching the document
# 1. find_all by tag name: matches the tag name exactly
aList = bs.find_all("a")

# 2. Regex search: pass a compiled pattern
aList = bs.find_all(re.compile("a"))


# 3. Search with a function: keep tags that have a "name" attribute
def name_is_exists(tag):
    return tag.has_attr("name")
aList = bs.find_all(name_is_exists)
print(aList)
for item in aList:
    print(item)


# 4. kwargs: search by attribute value
aList = bs.find_all(id="head")
aList = bs.find_all(class_=True)
aList = bs.find_all(href="http://ir.baidu.com")
for item in aList:
    print(item)

# 5. Text search: by keyword, by a list of keywords, or by regex
aList = bs.find_all(text="hao123")
aList = bs.find_all(text=["hao123", "地图", "贴吧"])
aList = bs.find_all(text=re.compile(r"\d"))
for item in aList:
    print(item)

# 6. limit: cap the number of results
aList = bs.find_all("a", limit=3)
for item in aList:
    print(item)

# CSS selectors
aList = bs.select("title")                # by tag name, e.g. <title>百度一下,你就知道</title>
aList = bs.select(".mnav")                # by class name, e.g. <a class="mnav c-font-normal c-color-t" ...>
aList = bs.select("#u1")                  # by id, e.g. <div class="s-top-right s-isindex-wrap" id="u1">
aList = bs.select("a[class='toindex']")   # by attribute, e.g. <a class="toindex" href="/"><!--百度首页--></a>
aList = bs.select("head > title")         # child selector, e.g. <title>百度一下,你就知道</title>

aList = bs.select("#u1 ~ .head_wrapper")  # sibling selector
for item in aList:
    print(item)
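
However the tags are found, what usually matters is their text and attributes. A minimal sketch (href is just the most common attribute on the Baidu page used above):

for a in bs.select("a[href]"):
    print(a.get_text(strip=True))  # visible text with surrounding whitespace stripped
    print(a.get("href"))           # attribute lookup; a["href"] also works but raises KeyError if absent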

sqlite3 handles the simple data storage.

import sqlite3

conn = sqlite3.connect("test.db")
print("connected to the database")
c = conn.cursor()       # get a cursor
# Create a table
sql = '''
    create table company
        (id int primary key not null,
        name text not null,
        age int not null,
        address text,
        salary real
        );
'''
c.execute(sql)  # execute the SQL
conn.commit()   # commit
conn.close()    # close the connection
print("table created")
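
The notes above only create the table; a minimal sketch of inserting and reading rows with the same module (table and columns as defined above; the sample row is made up):

conn = sqlite3.connect("test.db")
c = conn.cursor()
# "?" placeholders let sqlite3 do the quoting for us
c.execute("insert into company (id, name, age, address, salary) values (?, ?, ?, ?, ?)",
          (1, "Alice", 30, "somewhere", 8000.0))
conn.commit()
for row in c.execute("select id, name, salary from company"):
    print(row)  # each row comes back as a tuple
conn.close()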

Final code

#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@author:Evolve Hsu
@file:testUrlLib.py
@time:2021/03/20
"""
from bs4 import BeautifulSoup  # parse the page and extract data
import re  # regular-expression matching
import urllib.request, urllib.error  # build the URL request and fetch the page
import sqlite3  # SQLite database operations


def main():
    baseUrl = "https://movie.*ban.com/top250?start="  # fill in the real site yourself
    # 1. Crawl the pages
    dataList = getDataByUrl(baseUrl)
    # savePath = "C:\\Users\\24855\\Desktop\豆瓣电影TOP250\\豆瓣电影TOP250.xls"
    dbPath = "movie.db"

    saveData(dbPath, dataList)

    # askUrl(baseUrl)


# (.*?) matches any characters, non-greedily
# Link to the film's detail page
findLink = re.compile(r'<a href="(.*?)">')  # compiled pattern
# Poster image link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets "." match newlines too
# Title
findTitle = re.compile(r'<span class="title">(.*?)</span>', re.S)
# Rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of reviewers
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*?)</span>')
# Other details
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
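
# Quick sanity check of the patterns above on a hand-written snippet
# (the snippet is illustrative only, not copied from the real page):
#   sample = '<a href="https://movie.example/subject/1/"><span class="title">肖申克的救赎</span></a>'
#   re.findall(findLink, sample)   # -> ['https://movie.example/subject/1/']
#   re.findall(findTitle, sample)  # -> ['肖申克的救赎']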


# Crawl the pages
def getDataByUrl(baseUrl):
    dataList = []
    for num in range(0, 10):  # 10 pages of 25 movies each; work out the paging yourself
        url = baseUrl + str(num * 25)
        html = askUrl(url)
        # 2. Parse the data item by item
        soup = BeautifulSoup(html, "html.parser")  # parse with the html parser
        for item in soup.find_all('div', class_="item"):
            # print(item)  # inspect every movie block
            data = []  # the fields for one movie
            item = str(item)

            # Link to the film's detail page
            link = re.findall(findLink, item)[0]  # re.findall applies the compiled pattern to the string
            data.append(link)

            # Poster image link
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)

            # Title: there may be one or two title spans
            titles = re.findall(findTitle, item)
            cTitle = titles[0]  # Chinese title
            oTitle = ""  # foreign title
            if len(titles) > 1:
                oTitle = titles[1].replace("/", "").replace(" ", "")  # strip the "/" and spaces
            data.append(cTitle)
            data.append(oTitle)

            # Rating
            rating = re.findall(findRating, item)[0]
            data.append(rating)

            # Number of reviewers
            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)

            # One-line summary
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
            else:
                inq = ""
            data.append(inq)

            # Other details
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+?)', " ", bd)  # drop <br/> tags
            bd = re.sub("/", " ", bd)  # drop "/"
            bd = bd.replace(" ", "")
            data.append(bd.strip())  # trim leading/trailing whitespace

            dataList.append(data)  # collect the movie
    # print(dataList)
    return dataList


# 3. Save the data
def saveData(savePath, dataList):
    init_db(savePath)  # initialise the database
    conn = sqlite3.connect(savePath)
    c = conn.cursor()

    # Build one insert statement per movie
    for data in dataList:
        for index in range(len(data)):
            if index == 4 or index == 5:
                continue  # score and rated are numeric, leave them unquoted
            data[index] = '"' + data[index] + '"'  # wrap text values in quotes
        sql = '''
        insert into movie250 (into_link,pic_link,cname,oname,score,rated,instroduction,info)
        values (%s)''' % ",".join(data)
        print(sql)
        c.execute(sql)
        conn.commit()
    c.close()
    conn.close()
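

# The insert above builds SQL by string concatenation, which breaks as soon as a field
# contains a double quote. A possible alternative, sketched here rather than taken from
# the original tutorial (the function name is just illustrative), is to let sqlite3
# fill "?" placeholders instead:
def saveDataParameterized(savePath, dataList):
    init_db(savePath)  # reuse the same schema
    conn = sqlite3.connect(savePath)
    c = conn.cursor()
    sql = '''insert into movie250
             (into_link,pic_link,cname,oname,score,rated,instroduction,info)
             values (?,?,?,?,?,?,?,?)'''
    c.executemany(sql, dataList)  # one row per movie; the driver handles quoting
    conn.commit()
    conn.close()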


def init_db(savePath):
    sql = '''
        create table movie250
        (
            id integer primary key autoincrement,
            into_link text,
            pic_link text,
            cname varchar ,
            oname varchar ,
            score numeric ,
            rated numeric ,
            instroduction text ,
            info text
        );
    '''
    conn = sqlite3.connect(savePath)
    c = conn.cursor()
    c.execute(sql)
    conn.commit()
    conn.close()
    print("init_db success")


def askUrl(url):
    # request headers
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54',
    }
    request = urllib.request.Request(url=url, headers=headers)
    html = ""
    try:
        resp = urllib.request.urlopen(request)
        html = resp.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        print(e)
    return html
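
# Note: urllib.error.HTTPError is a subclass of URLError and carries extra detail;
# a more granular handler could catch it first, for example:
#     except urllib.error.HTTPError as e:
#         print(e.code, e.reason)
#     except urllib.error.URLError as e:
#         print(e.reason)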

# Entry point
if __name__ == "__main__":
    main()
