Getting Started with Python Web Scraping: Study Notes
Beginner notes from a Bilibili (B站) tutorial on scraping the *ban movie Top 250.
2021-03-21: learned the basics of web scraping from a Bilibili course and wrote them down here. This exercise uses three packages: urllib, bs4, and sqlite3.
urllib sends requests to web pages and fetches the responses.
# GET request
import urllib.request
import urllib.parse
import urllib.error

html = urllib.request.urlopen("http://www.baidu.com")
print(html.read().decode("utf-8"))  # read and decode the response body
# POST request
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding="utf-8")
html = urllib.request.urlopen("http://httpbin.org/post", data=data)
print(html.read().decode("utf-8"))
# Timeout and exception handling
try:
    html = urllib.request.urlopen("http://httpbin.org/get", timeout=0.1)
    print(html.read().decode("utf-8"))
except urllib.error.URLError as e:
    print("timeout", e)
response = urllib.request.urlopen("http://httpbin.org/get")
print(response.status)
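The response object carries more than the body: the status code and the response headers are available too. A quick sketch (getheaders and getheader belong to the standard http.client response that urlopen returns):

response = urllib.request.urlopen("http://httpbin.org/get")
print(response.status)  # e.g. 200
print(response.getheaders())  # all headers as (name, value) pairs
print(response.getheader("Content-Type"))  # a single header by name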
# Many sites inspect request headers to block crawlers, so requests should carry
# a browser-like User-Agent header. A typical value looks like:
# User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
}
url = "http://httpbin.org/post"
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
resp = urllib.request.urlopen(req).read().decode("utf-8")
print(resp)
# Request the *ban site
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54"
}
url = "http://www.*ban.com"  # fill in the real domain yourself
req = urllib.request.Request(url=url, headers=headers, method="GET")
resp = urllib.request.urlopen(req).read().decode("utf-8")
print(resp)
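Header-less or otherwise suspicious requests often come back as HTTP errors on sites like this one, so it helps to separate server error responses from network-level failures. A minimal sketch with the same urllib API (HTTPError must be caught before URLError, since it is a subclass):

try:
    resp = urllib.request.urlopen(req)
    print(resp.read().decode("utf-8"))
except urllib.error.HTTPError as e:  # the server answered, but with an error status
    print("HTTP error:", e.code, e.reason)
except urllib.error.URLError as e:  # DNS failure, refused connection, timeout...
    print("network error:", e.reason)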
bs4 handles parsing the elements and content of the fetched page.
from bs4 import BeautifulSoup
import re
# Read a saved page into memory
with open("./baidu.html", "rb") as file:
    html = file.read().decode("utf-8")
bs = BeautifulSoup(html, "html.parser")
# bs.<tag> returns the first element with that tag name
print(bs.title)
print(bs.a)
print(bs.head)
# 1. Tag: an element and its contents; bs.<tag> returns the first match found
print(type(bs.head))  # <class 'bs4.element.Tag'>
# 2. NavigableString: the string inside a tag
print(type(bs.title.string))  # <class 'bs4.element.NavigableString'>
print(type(bs.a.attrs))  # a tag's attributes come back as a dict
# 3. BeautifulSoup: the whole document object
print(type(bs))
print(bs)
# 4. Comment: a special string whose printed form drops the comment markers
print(bs.a.string)  # the comment text, without <!-- -->
print(type(bs.a.string))  # <class 'bs4.element.Comment'>
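A Comment prints exactly like ordinary text, so an isinstance check is the reliable way to tell them apart; a small sketch (Comment is importable from bs4 itself):

from bs4 import Comment

if isinstance(bs.a.string, Comment):
    print("comment node:", bs.a.string)
else:
    print("plain text:", bs.a.string)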
# Traversing the document
print(bs.head.contents)  # direct children, as a list
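Besides .contents, bs4 also exposes generator-based traversal; a brief sketch of the commonly used attributes:

# .children iterates direct children; .descendants walks the whole subtree;
# .parent goes one level up.
for child in bs.head.children:
    print(child)
print(bs.title.parent.name)  # -> "head"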
# Searching the document
# 1. find_all by tag name: finds every tag whose name matches the string exactly
aList = bs.find_all("a")
# 2. Regex search: finds every tag whose name matches the pattern
aList = bs.find_all(re.compile("a"))
# 3. Function search: keeps the tags for which the function returns True
#    (here: tags that have a "name" attribute)
def name_is_exists(tag):
    return tag.has_attr("name")

aList = bs.find_all(name_is_exists)
print(aList)
for item in aList:
    print(item)
# 4. kwargs: search by attribute value
aList = bs.find_all(id="head")
aList = bs.find_all(class_=True)
aList = bs.find_all(href="http://ir.baidu.com")
for item in aList:
    print(item)
# 5. Text search: by exact string, by a list of strings, or by regex
aList = bs.find_all(text="hao123")
aList = bs.find_all(text=["hao123", "地图", "贴吧"])
aList = bs.find_all(text=re.compile(r"\d"))
for item in aList:
    print(item)
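Note that newer bs4 releases (4.4 and later) prefer the name string for this argument; text still works but is the legacy spelling:

aList = bs.find_all(string="hao123")  # same result as text="hao123"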
# 6. limit: cap the number of results
aList = bs.find_all("a", limit=3)
for item in aList:
    print(item)
# CSS selectors
aList = bs.select("title")  # by tag name -> <title>百度一下,你就知道</title>
aList = bs.select(".mnav")  # by class name -> <a class="mnav c-font-normal c-color-t" ...
aList = bs.select("#u1")  # by id -> <div class="s-top-right s-isindex-wrap" id="u1">
aList = bs.select("a[class='toindex']")  # by attribute -> <a class="toindex" href="/"><!--百度首页--></a>
aList = bs.select("head > title")  # child selector -> <title>百度一下,你就知道</title>
aList = bs.select("#u1 ~ .head_wrapper")  # sibling selector
for item in aList:
    print(item)
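select returns Tag objects, so attributes can be read straight off the results; a short sketch that collects every link URL on the page:

# Pull the href off every anchor; get() returns None when the attribute is absent
for a in bs.select("a"):
    href = a.get("href")
    if href:
        print(a.get_text(strip=True), "->", href)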
sqlite3 handles the simple data storage.
import sqlite3

conn = sqlite3.connect("test.db")
print("connected to the database")
c = conn.cursor()  # get a cursor
# Create a table
sql = '''
create table company
    (id int primary key not null,
    name text not null,
    age int not null,
    address text,
    salary real
    );
'''
c.execute(sql)  # run the DDL
conn.commit()  # commit the change
conn.close()  # close the connection
print("table created")
Final code
#!/usr/bin/python
# -*- coding: UTF-8 -*-
"""
@author:Evolve Hsu
@file:testUrlLib.py
@time:2021/03/20
"""
from bs4 import BeautifulSoup  # parse pages and extract data
import re  # regular-expression matching
import urllib.request, urllib.error  # build requests and fetch page data
import sqlite3  # SQLite database access
def main():
    baseUrl = "https://movie.*ban.com/top250?start="  # fill in the real domain yourself
    # 1. Crawl the pages
    dataList = getDataByUrl(baseUrl)
    # savePath = "C:\\Users\\24855\\Desktop\\豆瓣电影TOP250\\豆瓣电影TOP250.xls"
    dbPath = "movie.db"
    # 3. Save the data
    saveData(dbPath, dataList)
    # askUrl(baseUrl)
# (.*?) lazily matches any run of characters
# Detail-page link
findLink = re.compile(r'<a href="(.*?)">')  # compiled regex object
# Poster image link
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)  # re.S lets "." match newlines too
# Title
findTitle = re.compile(r'<span class="title">(.*?)</span>', re.S)
# Rating
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# Number of ratings
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# One-line summary
findInq = re.compile(r'<span class="inq">(.*?)</span>')
# Related details
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
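To see what these patterns capture, here is a small check against a hand-written fragment shaped like one entry of the real list markup (the snippet and its values are made up for illustration):

# Hypothetical fragment in the shape of one list entry
sample = '''<a href="https://example.com/subject/1/">
<span class="title">Some Movie</span></a>
<span class="rating_num" property="v:average">9.7</span>
<span>123456人评价</span>'''
print(re.findall(findLink, sample))    # ['https://example.com/subject/1/']
print(re.findall(findTitle, sample))   # ['Some Movie']
print(re.findall(findRating, sample))  # ['9.7']
print(re.findall(findJudge, sample))   # ['123456']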
# Crawl the pages
def getDataByUrl(baseUrl):
    dataList = []
    for num in range(0, 10):  # 10 pages of 25 entries each; work out the paging yourself
        url = baseUrl + str(num * 25)
        html = askUrl(url)
        # 2. Parse each page as it arrives
        soup = BeautifulSoup(html, "html.parser")  # parse with the built-in html parser
        for item in soup.find_all('div', class_="item"):
            # print(item)  # inspect every movie entry
            data = []  # one movie's fields
            item = str(item)
            # Detail-page link
            link = re.findall(findLink, item)[0]  # apply the compiled pattern to this entry
            data.append(link)
            # Poster image link
            imgSrc = re.findall(findImgSrc, item)[0]
            data.append(imgSrc)
            # Title: there may be one or two title spans
            titles = re.findall(findTitle, item)
            cTitle = titles[0]  # Chinese title
            oTitle = ""  # foreign title
            if len(titles) > 1:
                oTitle = titles[1].replace("/", "").replace(" ", "")  # strip the "/" separator
            data.append(cTitle)
            data.append(oTitle)
            # Rating
            rating = re.findall(findRating, item)[0]
            data.append(rating)
            # Number of ratings
            judgeNum = re.findall(findJudge, item)[0]
            data.append(judgeNum)
            # One-line summary
            inq = re.findall(findInq, item)
            if len(inq) != 0:
                inq = inq[0].replace("。", "")
            else:
                inq = ""
            data.append(inq)
            # Related details
            bd = re.findall(findBd, item)[0]
            bd = re.sub(r'<br(\s+)?/>(\s+?)', " ", bd)  # replace <br/> tags
            bd = re.sub("/", " ", bd)  # replace "/" separators
            bd = bd.replace("\xa0", "")  # strip non-breaking spaces
            data.append(bd.strip())  # trim leading/trailing whitespace
            dataList.append(data)  # collect this movie
    # print(dataList)
    return dataList
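Fetching ten pages back to back is easy to mistake for abuse on the server side; a brief pause between page requests is common courtesy. A sketch of what could go inside the page loop, right after the askUrl call (the half-second figure is arbitrary):

import time

time.sleep(0.5)  # arbitrary pause between page fetches to stay polite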
# Save the data
def saveData(savePath, dataList):
    init_db(savePath)  # initialize the database
    conn = sqlite3.connect(savePath)
    c = conn.cursor()
    # Build one insert statement per movie
    for data in dataList:
        for index in range(len(data)):
            if index == 4 or index == 5:
                continue  # score and rating count are numeric, so no quoting
            data[index] = '"' + data[index] + '"'  # wrap string fields in quotes
        sql = '''
        insert into movie250 (info_link,pic_link,cname,oname,score,rated,introduction,info)
        values (%s)''' % ",".join(data)
        print(sql)
        c.execute(sql)
        conn.commit()
    c.close()
    conn.close()
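Splicing quoted strings into SQL breaks as soon as a field contains a double quote. A safer variant of the same insert, sketched with sqlite3 placeholders (this skips the manual quoting loop entirely):

# "?" placeholders: sqlite3 escapes values itself, so quotes in titles
# or summaries cannot break the statement.
sql = '''insert into movie250
         (info_link, pic_link, cname, oname, score, rated, introduction, info)
         values (?, ?, ?, ?, ?, ?, ?, ?)'''
for data in dataList:
    c.execute(sql, data)
conn.commit()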
def init_db(savePath):
    sql = '''
    create table movie250
    (
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        oname varchar,
        score numeric,
        rated numeric,
        introduction text,
        info text
    );
    '''
    conn = sqlite3.connect(savePath)
    c = conn.cursor()
    c.execute(sql)
    conn.commit()
    conn.close()
    print("init_db success")
# Fetch a single page
def askUrl(url):
    # Request headers: present ourselves as a browser
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.90 Safari/537.36 Edg/89.0.774.54',
    }
    request = urllib.request.Request(url=url, headers=headers)
    html = ""
    try:
        resp = urllib.request.urlopen(request)
        html = resp.read().decode("utf-8")
        # print(html)
    except urllib.error.URLError as e:
        print(e)
    return html
# Entry point
if __name__ == "__main__":
    main()