import urllib.request, urllib.error
import gzip
def ungzip(data):
    """Gzip-decompress *data* if possible; return it unchanged otherwise.

    The server may answer either gzip-compressed or plain bytes (we send
    'Accept-Encoding: gzip, ...'), so a failed decompression is expected
    for uncompressed responses and is deliberately ignored.

    :param data: raw response body as bytes
    :return: decompressed bytes, or the original bytes when not gzip
    """
    try:
        data = gzip.decompress(data)
    except (OSError, EOFError):
        # OSError covers gzip.BadGzipFile (bad magic / not gzip);
        # EOFError covers truncated streams. A bare `except:` here would
        # also swallow KeyboardInterrupt/SystemExit — keep it narrow.
        pass
    return data
def obtainHtml(url):
    """Fetch *url* with browser-like headers and return the body as text.

    :param url: page URL to request
    :return: UTF-8 decoded page body, or "" when the request failed
             (the HTTP code/reason is printed in that case)
    """
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/93.0.4577.63 Safari/537.36',
        # NOTE(review): fill in your own logged-in Baidu cookie here —
        # the original author noted the cookie only exists after login.
        'Cookie': ' ',
        'Accept': 'image/avif,image/webp,image/apng,image/svg+xml,image/*,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    request = urllib.request.Request(url, headers=head)
    html = ""
    try:
        # `with` guarantees the HTTP connection is closed — the original
        # leaked the response object on every call.
        with urllib.request.urlopen(request) as response:
            # Body may arrive gzip-compressed (see Accept-Encoding above).
            html = ungzip(response.read()).decode("utf-8")
    except urllib.error.URLError as e:
        # HTTPError has .code; plain URLError only has .reason.
        if hasattr(e, "code"):
            print(e.code)
        if hasattr(e, "reason"):
            print(e.reason)
    return html
from bs4 import BeautifulSoup
import re
# Capture the src="..." attribute; (...) is a group, .*? is a lazy match,
# re.S lets '.' also match newlines.
# NOTE(review): `balanceWeightImg` is not a real HTML tag — this looks like
# the search keyword got pasted over `<img ...>` by the blog platform;
# verify against the actual page markup before relying on it.
findImg = re.compile(r'<balanceWeightImg .*?src="(.*?)".*?>', re.S)
def getData(html):
    """Extract image URLs from a Baidu image-search result page.

    Scans every <div id="wrapper"> for the `findImg` pattern, then strips
    each hit down to the part between '//' and the closing quote.

    :param html: page source as returned by obtainHtml()
    :return: list of extracted URL fragments (was implicitly None before;
             the existing caller ignores the return value, so this is
             backward-compatible)
    """
    soup = BeautifulSoup(html, "html.parser")
    # find_all(tag, attributes, recursive, text, limit, keywords)
    items = soup.find_all("div", {'id': 'wrapper'})
    print(items)
    data = []
    for item in items:
        # findall returns a (possibly empty) list of src values per div.
        data.append(re.findall(findImg, str(item)))
    # Pull out everything between '//' and the trailing quote.
    strChange = re.compile(r".*?//(.*?)'.*?")
    urls = []
    for ImgSrc in data:
        if not ImgSrc:
            continue
        # Guard the second match: the original indexed [0] unconditionally
        # and raised IndexError when the pattern found nothing.
        matches = re.findall(strChange, str(ImgSrc[0]))
        if matches:
            urls.append(matches[0])
        # Saving the images to disk was left commented out in the original:
        # response = urllib.request.urlopen(url); open(..., "wb").write(...)
    return urls
if __name__ == "__main__":
url = "https://image.baidu.com/search/index?tn=baiduimage&ct=201326592&lm=-1&cl=2&ie=gb18030&word=%C6%BD%BA%E2%C5%E4%D6%D8%BF%E9%B1%EA%CA%B6&fr=ala&ala=1&alatpl=normal&pos=0&dyTabStr=MCwzLDUsMiwxLDYsNCw3LDgsOQ%3D%3D"
html = obtainHtml(url)
#print(html)
getData(html)
# A simple Python crawler method (1)
# Latest recommended article published 2023-05-04 11:44:20