# 1. Searching Baidu by keyword
import requests

# Baidu keyword-search API: https://www.baidu.com/s?wd=keyword
search_params = {'wd': 'Python'}
# A browser-like Accept header lets the request get past Baidu's bot check.
request_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'}
response = requests.get("https://www.baidu.com/s", headers=request_headers, params=search_params)
print(response.request.url)
print(response.request.headers)
print(response.status_code, response.encoding, len(response.text))
response.encoding = 'utf-8'
print(response.text)
# 2. Downloading an image from the web and saving it locally
import os
import requests

# Download a single image and save it under `root`, skipping the download
# when the file is already present on disk.
url = "https://www.zhifure.com/upload/images/2018/7/13181228598.jpg"  # image URL
root = "C://Users//123//Desktop//新建文件夹//"  # root directory for saved files
path = root + url.split('/')[-1]  # destination: root + original filename
try:
    # makedirs creates intermediate directories too and is a no-op when the
    # directory already exists (the original mkdir would fail on nested paths).
    os.makedirs(root, exist_ok=True)
    if not os.path.exists(path):  # only download when not already saved
        r = requests.get(url, timeout=30)
        r.raise_for_status()  # fail early instead of writing an error page to disk
        # 'wb': binary write mode; r.content is the raw response bytes.
        # The with-block closes the file automatically (the original's explicit
        # f.close() inside the with-block was redundant).
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except (requests.RequestException, OSError):
    # Narrowed from a bare except: only network and filesystem errors are
    # expected here; anything else should surface as a traceback.
    print("爬取失败")
from bs4 import BeautifulSoup
import requests

r = requests.get("http://python123.io/ws/demo.html")
#print(r.text)
demo = r.text  # raw HTML text returned by the site
# Parse the document with bs4 using Python's built-in "html.parser".
soup = BeautifulSoup(demo, "html.parser")
print(soup.prettify())  # pretty-printed view of the parsed tree
title_tag = soup.title  # soup.<tag> returns the first tag with that name
print(title_tag)  # the <title> tag
# .name is a tag's name; .parent is its enclosing tag.
print(title_tag.parent.name, title_tag.parent.parent.name)
first_link = soup.a
print(first_link.attrs)  # all attributes of the tag, as a dict
print(first_link.attrs['class'])  # value of the class attribute
print(first_link.attrs['href'])  # value of the href attribute
print(type(first_link.attrs))  # the attrs mapping is a plain dict
print(type(first_link))  # bs4.element.Tag
print(soup.a.string)  # string content inside the tag
print(soup.p.string)  # reaches through nested tags (here: the <b> inside <p>)
# Program output:
#<html>
# <head>
# <title>
# This is a python demo page
# </title>
# </head>
# <body>
# <p class="title">
# <b>
# The demo python introduces several python courses.
# </b>
# </p>
# <p class="course">
# Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
# Basic Python
# </a>
# and
# <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
# Advanced Python
# </a>
# .
# </p>
# </body>
# </html>
#
# <title>This is a python demo page</title>
# head html
# {'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'}
# ['py1']
# http://www.icourse163.org/course/BIT-268001
# <class 'dict'>
# <class 'bs4.element.Tag'>
# Basic Python
# The demo python introduces several python courses.
#
# Process finished with exit code 0
# Downward traversal
import requests
from bs4 import BeautifulSoup  # fixed: the class is BeautifulSoup, not Beautifulsoup

# Downward traversal: .contents / .children expose a tag's direct children.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
#print(soup.prettify())
print(soup.head)
print(soup.head.contents)  # list of the tag's direct children
print(soup.body.contents)
print(len(soup.body.contents))  # number of direct children of <body>
print(soup.body.contents[0])  # children are accessible by index
for child in soup.body.children:  # iterate over the direct children
    print(child)
#<head><title>This is a python demo page</title></head>
# [<title>This is a python demo page</title>]
# ['\n', <p class="title"><b>The demo python introduces several python courses.</b></p>, '\n', <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>, '\n']
# 5
#
#
#
#
# <p class="title"><b>The demo python introduces several python courses.</b></p>
#
#
# <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
#
#
#
# Process finished with exit code 0
# Upward traversal
# Upward traversal: .parent / .parents walk from a tag toward the document root.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.title.parent)
print(soup.html.parent)
print(soup.parent)  # the BeautifulSoup object itself has no parent -> None
for ancestor in soup.a.parents:
    if ancestor is None:
        print(ancestor)
    else:
        print(ancestor.name)
# Sibling (parallel) traversal
# Sibling traversal, in HTML text order.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.a.next_sibling)  # next sibling node in document order (may be a plain string)
print(soup.a.next_sibling.next_sibling)
print(soup.a.previous_sibling)  # previous sibling node in document order
print(soup.a.previous_sibling.previous_sibling)
# Searching with bs4 (find_all)
import requests
import re
from bs4 import BeautifulSoup

# Demonstrates the main find_all() filter forms: name list, True, regex,
# attribute filters, and string filters.
r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print(soup.find_all(['a', 'b']))  # a list of names matches any of them
for link in soup.find_all('a'):  # every <a> tag
    print(link)
for tag in soup.find_all(True):  # True matches every tag in the document
    print(tag.name)
for tag in soup.find_all(re.compile('b')):  # regex on tag names (matches e.g. <body>, <b>)
    print(tag.name)
print(soup.find_all('p', 'course'))  # <p> tags whose class attribute is "course"
print(soup.find_all(id='link1'))  # tags with the attribute id="link1"
print(soup.find_all(id=re.compile('link')))  # ids matched by the regex "link"
print(soup.find_all(string='Basic Python'))  # exact match on string content inside tags
print(soup.find_all(string=re.compile('Python')))  # all strings containing "Python"
print(soup('a') == soup.find_all('a'))  # calling the soup is shorthand for find_all -> True