爬虫--requests、BeautifulSoup入门

1、通过关键字进行搜索

import requests

# Baidu keyword-search endpoint: https://www.baidu.com/s?wd=keyword
query = {'wd': 'Python'}
# A browser-like Accept header lets the request skip Baidu's verification page.
req_headers = {'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9'}
r = requests.get("https://www.baidu.com/s", headers=req_headers, params=query)
print(r.request.url)        # full URL actually sent (with ?wd=Python appended)
print(r.request.headers)    # headers actually sent
print(r.status_code, r.encoding, len(r.text))
r.encoding = 'utf-8'        # force decoding so the page text renders correctly
print(r.text)

2、爬取网络图片将其保存到本地

import os
import requests

url = "https://www.zhifure.com/upload/images/2018/7/13181228598.jpg"   # image URL
root = "C://Users//123//Desktop//新建文件夹//"  # root directory for downloads
path = root + url.split('/')[-1]  # local path: last URL segment used as the filename

try:
    if not os.path.exists(root):
        # makedirs also creates missing intermediate directories;
        # os.mkdir would raise if any parent did not exist.
        os.makedirs(root)
    if not os.path.exists(path):  # download only when the file is not already cached
        r = requests.get(url)
        r.raise_for_status()  # fail early on HTTP errors instead of saving an error page
        # 'wb': binary write mode — r.content is raw bytes.
        # The with-block closes the file automatically; no explicit close needed.
        with open(path, 'wb') as f:
            f.write(r.content)
        print("文件保存成功")
    else:
        print("文件已存在")
except Exception:
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    print("爬取失败")

在这里插入图片描述
在这里插入图片描述

from bs4 import BeautifulSoup
import requests

r = requests.get("http://python123.io/ws/demo.html")
#print(r.text)
page_text = r.text  # raw HTML returned by the site
# Parse with the standard-library HTML parser backend.
soup = BeautifulSoup(page_text, "html.parser")
print(soup.prettify())  # pretty-printed view of the parse tree

title_tag = soup.title  # soup.<name> yields the first tag with that name
print(title_tag)
# .parent walks one level up the tree; chained twice here.
print(title_tag.parent.name, title_tag.parent.parent.name)

first_link = soup.a
print(first_link.attrs)            # all attributes of the tag, as a dict
print(first_link.attrs['class'])   # value of the class attribute
print(first_link.attrs['href'])    # value of the href attribute
print(type(first_link.attrs))      # plain dict
print(type(first_link))            # bs4.element.Tag
print(soup.a.string)  # text contained in the tag
print(soup.p.string)
#运行结果
#<html>
#  <head>
#   <title>
#    This is a python demo page
#   </title>
#  </head>
#  <body>
#   <p class="title">
#    <b>
#     The demo python introduces several python courses.
#    </b>
#   </p>
#   <p class="course">
#    Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
#    <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">
#     Basic Python
#    </a>
#    and
#    <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">
#     Advanced Python
#    </a>
#    .
#   </p>
#  </body>
# </html>
#
# <title>This is a python demo page</title>
# head html
# {'href': 'http://www.icourse163.org/course/BIT-268001', 'class': ['py1'], 'id': 'link1'}
# ['py1']
# http://www.icourse163.org/course/BIT-268001
# <class 'dict'>
# <class 'bs4.element.Tag'>
# Basic Python
# The demo python introduces several python courses.
# 
# Process finished with exit code 0

在这里插入图片描述
下行遍历
在这里插入图片描述

import requests
from bs4 import BeautifulSoup  # fixed: the class is BeautifulSoup, not Beautifulsoup

r = requests.get("http://python123.io/ws/demo.html")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
#print(soup.prettify())
print(soup.head)
print(soup.head.contents)        # .contents: list of a tag's direct children
print(soup.body.contents)
print(len(soup.body.contents))   # number of direct children of <body>
print(soup.body.contents[0])     # children can be addressed by index

for child in soup.body.children:  # .children: iterator over direct children
    print(child)

#<head><title>This is a python demo page</title></head>
# [<title>This is a python demo page</title>]
# ['\n', <p class="title"><b>The demo python introduces several python courses.</b></p>, '\n', <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>, '\n']
# 5
# 
# 
# 
# 
# <p class="title"><b>The demo python introduces several python courses.</b></p>
# 
# 
# <p class="course">Python is a wonderful general-purpose programming language. You can learn Python from novice to professional by tracking the following courses:
# <a class="py1" href="http://www.icourse163.org/course/BIT-268001" id="link1">Basic Python</a> and <a class="py2" href="http://www.icourse163.org/course/BIT-1001870001" id="link2">Advanced Python</a>.</p>
# 
# 
# 
# Process finished with exit code 0

上行遍历
在这里插入图片描述

r = requests.get("http://python123.io/ws/demo.html")
html = r.text
soup = BeautifulSoup(html, "html.parser")
print(soup.title.parent)  # enclosing tag of <title>
print(soup.html.parent)   # <html>'s parent is the document itself
print(soup.parent)        # the soup object has no parent -> None

# Walk every ancestor of the first <a> tag; the topmost entry may be None,
# which has no .name, so guard before printing.
for ancestor in soup.a.parents:
    print(ancestor if ancestor is None else ancestor.name)

平行遍历
在这里插入图片描述
在这里插入图片描述

r = requests.get("http://python123.io/ws/demo.html")
html = r.text
soup = BeautifulSoup(html, "html.parser")
first_link = soup.a
print(first_link.next_sibling)                     # next node in document order
print(first_link.next_sibling.next_sibling)        # two nodes forward
print(first_link.previous_sibling)                 # previous node in document order
print(first_link.previous_sibling.previous_sibling)

基于bs4的相关查找

在这里插入图片描述

import requests
import re
from bs4 import BeautifulSoup

r = requests.get("http://python123.io/ws/demo.html")
page = r.text
soup = BeautifulSoup(page, "html.parser")

print(soup.find_all(['a','b']))  # list argument: match either tag name
for anchor in soup.find_all('a'):  # every <a> tag in the document
    print(anchor)
for node in soup.find_all(True):  # True matches every tag in the tree
    print(node.name)

# A compiled regex matches any tag whose name contains 'b' (e.g. b, body).
for node in soup.find_all(re.compile('b')):
    print(node.name)

print(soup.find_all('p','course'))  # <p> tags whose class attribute matches "course"

print(soup.find_all(id = 'link1'))  # keyword filter on the id attribute

print(soup.find_all(id = re.compile('link')))  # regex filter on the id attribute value

print(soup.find_all(string = 'Basic Python'))  # exact match on text inside tags

print(soup.find_all(string = re.compile('Python')))  # regex over tag text content

print(soup('a') == soup.find_all('a'))  # calling the soup is shorthand for find_all -> True

在这里插入图片描述
在这里插入图片描述

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值