b站学习地址:urllib获取网页数据
https://www.bilibili.com/video/BV12E411A7ZQ?p=18
1、get请求
import urllib.request # 指定url,获取网页数据
import urllib.parse # 解析器
# Fetch the Baidu home page with a plain GET request. The `with` block closes
# the HTTP response (and its socket) as soon as the body has been read.
with urllib.request.urlopen("http://www.baidu.com") as response:
    print(response.read().decode('utf-8'))  # decode the raw page bytes as UTF-8
2、post请求
# POST request against http://httpbin.org/post, an endpoint meant for testing.
# A POST can be used to simulate a real user login.
try:
    # urlopen() needs the form fields as URL-encoded bytes.
    data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding='utf-8')
    # The `with` block closes the response even if reading or decoding fails.
    with urllib.request.urlopen("http://httpbin.org/post", data=data, timeout=1) as response:
        print(response.read().decode('utf-8'))
except Exception as e:
    reported = False
    if hasattr(e, "code"):  # print the status code when the error carries one
        print(e.code)
        reported = True
    if hasattr(e, "reason"):
        print(e.reason)
        reported = True
    if not reported:
        # Errors with neither attribute (e.g. a timeout while reading the body)
        # used to vanish silently; show them instead.
        print(e)
3、获取某个信息
# Inspect the response headers returned by httpbin's GET endpoint. The `with`
# block closes the connection once the headers have been printed.
with urllib.request.urlopen("http://httpbin.org/get") as response:
    print(response.getheaders())
4、带headers封装post
# Send a POST request with custom headers by wrapping everything in a Request.
url = "https://www.douban.com"
data = bytes(urllib.parse.urlencode({"hello": "world"}), encoding='utf-8')
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.198 Safari/537.36"
}
# The User-Agent header tells the server what kind of client is asking; sending
# a browser's value makes the request look like it came from a browser.
req = urllib.request.Request(url=url, data=data, headers=headers, method="POST")
# The `with` block closes the response once the body has been printed.
with urllib.request.urlopen(req) as response:
    print(response.read().decode('utf-8'))
b站学习地址:BeautifulSoup解析获取到的网页数据
https://www.bilibili.com/video/BV12E411A7ZQ?p=20
1、定义网页对象,选择对应解析器
import re # 正则表达式
from bs4 import BeautifulSoup # 网页解析获取数据
# BeautifulSoup turns a complex HTML document into a tree in which every node
# is a Python object. All objects fall into four kinds:
# - Tag
# - NavigableString
# - BeautifulSoup
# - Comment
# "./" is the current directory and "rb" reads raw bytes. The `with` block
# closes the file handle as soon as the content has been read.
with open("./baidu.html", "rb") as file:
    html = file.read()
bs = BeautifulSoup(html, "html.parser")  # parse with the html.parser backend
2、 获取Tag标签、标签内容、标签属性
# Access a Tag by name; this only returns the first matching tag
print(bs.title)
print(bs.head)
# The content inside the tag
print(bs.title.string)
# The attribute values of the tag
print(bs.a.attrs)
3、文档遍历
# Traversing the document: print the contents of <head>, then its item at index 1
print(bs.head.contents)
print(bs.head.contents[1])
4、文档搜索
# Searching the document
t_list = bs.find_all("a", limit=3) # string filter: finds content that matches the string exactly (at most 3 results)
t_list = bs.find_all(re.compile("a")) # regular expression: finds everything whose name contains "a"
# Search with a custom filter function
def name_is_exists(tag):
    """Return whether *tag* has a ``name`` attribute.

    Used below as the filter callable passed to ``bs.find_all``.

    Args:
        tag: An object exposing ``has_attr(key)``.

    Returns:
        The result of ``tag.has_attr("name")``.
    """
    return tag.has_attr("name")
t_list = bs.find_all(name_is_exists)
for item in t_list:
    print(item)
# Search by keyword arguments
t_list = bs.find_all(id="head")
t_list = bs.find_all(class_=True)  # every tag that has a class attribute
# Search by text
t_list = bs.find_all(text=["hao123", "地图", "贴吧"])
# Raw string so that \d reaches the regex engine instead of being treated as an
# (invalid) string escape.
t_list = bs.find_all(text=re.compile(r"\d"))  # text containing a digit
# CSS selectors
t_list = bs.select('title')  # by tag name
t_list = bs.select(".mnav")  # by class name (leading "." means class)
t_list = bs.select("#u1")  # by id
# The original selector "a[class]='bri']" had a misplaced "]" and was not valid CSS.
t_list = bs.select("a[class='bri']")  # by attribute value
t_list = bs.select("head > title")  # by child relationship, one level at a time
t_list = bs.select(".mnav ~ .bri")  # by sibling relationship
if t_list:  # the selector may match nothing; avoid an IndexError on [0]
    print(t_list[0].get_text())
基本代码
https://www.bilibili.com/video/BV12E411A7ZQ?p=24
import re
from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
import xlwt
import sqlite3
# Base URL of the listing; getData appends the start offset to it
url = "http://movie.douban.com/top250?start="
# Compiled patterns applied by getData to the HTML text of one item
findLink = re.compile(r'<a href="(.*?)">') # captures the href value of an <a> tag
# captures the src value of an <img> tag; re.S lets "." also match newlines
findImgSrc = re.compile(r'<img.*src="(.*?)"', re.S)
# captures the text of each <span class="title">
findTitle = re.compile(r'<span class="title">(.*?)</span>')
# captures the text of the rating_num span
findRating = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')
# captures the digits that precede "人评价" inside a <span>
findJudge = re.compile(r'<span>(\d*)人评价</span>')
# captures the text of <span class="inq">
findIng = re.compile(r'<span class="inq">(.*)</span>')
# captures the body of <p class="">; re.S lets it span several lines
findBd = re.compile(r'<p class="">(.*?)</p>', re.S)
def main():
    """Scrape the listing at the module-level ``url`` and store it in SQLite.

    The rows returned by ``getData`` are written to ``movie.db`` through
    ``saveData2DB``.
    """
    datalist = getData(url)
    savepath = "douban.xls"
    dbpath = "movie.db"
    # Alternative sink: saveData(datalist, savepath) writes the rows to Excel.
    saveData2DB(datalist, dbpath)  # store in the SQLite database
def getData(url):
    """Download the listing pages and extract one record per movie item.

    Requests ``url`` followed by the start offsets 0, 25, ..., 225 through
    ``askURL`` and parses every ``<div class="item">`` found in the result.

    Args:
        url: Base URL to which the start offset is appended.

    Returns:
        A list of records. Each record is a list of eight strings: detail
        link, image link, first title, second title (``' '`` when absent),
        rating, number of ratings, quote (``" "`` when absent) and the cleaned
        info paragraph.

    Raises:
        IndexError: If an item lacks one of the mandatory fields.
    """
    datalist = []
    for i in range(0, 10):
        html = askURL(url + str(i * 25))
        soup = BeautifulSoup(html, "html.parser")
        # Every <div class="item"> holds one movie.
        for item in soup.find_all('div', class_="item"):
            datalist.append(_parse_item(str(item)))
    return datalist


def _parse_item(item):
    """Extract the eight record fields from the HTML text of one item."""
    data = []
    # The link pattern can match more than once; only the first is needed.
    data.append(re.findall(findLink, item)[0])
    data.append(re.findall(findImgSrc, item)[0])
    titles = re.findall(findTitle, item)
    if len(titles) == 2:  # both a Chinese and a foreign title
        data.append(titles[0])
        data.append(titles[1].replace("/", ""))
    else:
        data.append(titles[0])
        data.append(' ')  # placeholder when there is no foreign title
    data.append(re.findall(findRating, item)[0])
    data.append(re.findall(findJudge, item)[0])
    ing = re.findall(findIng, item)
    if len(ing) != 0:
        data.append(ing[0].replace("。", ""))
    else:
        data.append(" ")  # placeholder when the item has no quote
    bd = re.findall(findBd, item)[0]
    # Raw string: \s must reach the regex engine, not the string-literal parser.
    bd = re.sub(r'<br(\s+)?/>(\s+)?', " ", bd)  # drop <br/> tags
    bd = re.sub('/', " ", bd)  # replace slashes with spaces
    data.append(bd.strip())
    return data
def askURL(url):
    """Fetch *url* and return its body decoded as UTF-8.

    The request is a plain GET (the original forced ``POST`` for what is a
    page fetch) and carries a browser User-Agent. The original header value
    had stray spaces around every ``/``, which is not a well-formed
    User-Agent string.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded page text, or ``""`` when the request or decoding fails.
        Failures are printed rather than raised (best effort).
    """
    html = ""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.114 Safari/537.36 Edg/91.0.864.59"
    }
    req = urllib.request.Request(url=url, headers=headers)
    try:
        # The `with` block closes the response even if read/decode fails.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode('utf-8')
    except Exception as e:
        # Deliberately broad: callers rely on getting "" instead of an exception.
        reported = False
        if hasattr(e, "code"):
            print(e.code)
            reported = True
        if hasattr(e, "reason"):
            print(e.reason)
            reported = True
        if not reported:
            print(e)  # errors without code/reason used to be swallowed silently
    return html
def saveData(datalist, savepath):
    """Write the scraped records to an Excel workbook.

    Row 0 holds the column titles; each record fills one following row. Every
    record in *datalist* is written, so a short (or empty) list no longer
    raises ``IndexError`` as the original fixed ``range(0, 250)`` did.

    Args:
        datalist: Sequence of records, each with at least eight values.
        savepath: Path of the ``.xls`` file to write.
    """
    print("开始!")
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)
    # cell_overwrite_ok=True allows a cell to be written more than once.
    sheet = book.add_sheet('douban250', cell_overwrite_ok=True)
    col = ("电影链接", "图片链接", "中文名", "外文名", "评分", "评价数", "概述", "相关信息")
    for j, title in enumerate(col):
        sheet.write(0, j, title)
    for i, data in enumerate(datalist, start=1):
        for j in range(len(col)):
            sheet.write(i, j, data[j])
    # Save before announcing the end so the message is only shown on success.
    book.save(savepath)
    print("结束!")
def saveData2DB(datalist, dbpath):
    """Insert the scraped records into the ``movie250`` table.

    Values are bound as SQL parameters instead of being spliced into the
    statement, so quotes inside a title or quote text can neither break the
    statement nor inject SQL. The rows in *datalist* are left unmodified.
    All rows are inserted in a single transaction.

    Args:
        datalist: Iterable of records with eight values each, in the order
            info_link, pic_link, cname, oname, score, rated, instroduction,
            info.
        dbpath: Path of the SQLite database file.

    Raises:
        sqlite3.Error: If a row cannot be inserted; nothing is committed then.
    """
    init_db(dbpath)
    sql = '''
        insert into movie250 (
        info_link,pic_link,cname,oname,score,rated,instroduction,info)
        values(?,?,?,?,?,?,?,?)'''
    conn = sqlite3.connect(dbpath)
    try:
        with conn:  # commits on success, rolls back on error
            conn.executemany(sql, datalist)
    finally:
        conn.close()


def init_db(dbpath):
    """Create the ``movie250`` table in *dbpath* if it does not exist yet.

    ``if not exists`` makes the call repeatable; the original statement
    failed on every run after the first.

    Args:
        dbpath: Path of the SQLite database file; it is created when missing.
    """
    sql = '''
        create table if not exists movie250(
        id integer primary key autoincrement,
        info_link text,
        pic_link text,
        cname varchar,
        oname varchar,
        score numeric,
        rated numeric,
        instroduction text,
        info text
        )
    '''
    conn = sqlite3.connect(dbpath)  # opens the database, creating the file if needed
    try:
        with conn:  # commit the schema change
            conn.execute(sql)
    finally:
        conn.close()  # always release the connection
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
报错相关
object of type 'NoneType' has no len()
原因:被调用的函数漏写 return,默认返回 None,再对返回值调用 len() 就会报错
UnboundLocalError: local variable 'a' referenced before assignment
原因:函数内部对与全局变量同名的变量进行了赋值,Python 会把它当作局部变量,在赋值之前读取就会报错;如需修改全局变量,请先用 global 声明
注意:请求过于频繁会被豆瓣封IP