python爬虫request+beautifulsoup（正则表达式、CSS选择器）或xpath

最新推荐文章于 2024-08-29 11:00:00 发布

qq_40707462

最新推荐文章于 2024-08-29 11:00:00 发布

阅读量673

点赞数

文章标签： python

本文链接：https://blog.csdn.net/qq_40707462/article/details/105656564

版权

一、用 urllib.request 或 requests 获取网页全部信息和源代码，之后用 BeautifulSoup 提取所需信息

# Method 1: urllib.request
from urllib import response
from urllib.request import urlopen
import urllib.error
import urllib.request

# `url` and `head` (expected to be a dict of HTTP request headers) must be
# defined before this snippet runs.
request = urllib.request.Request(url, headers=head)
html = ""
try:
    # The context manager closes the connection once the body has been read.
    with urllib.request.urlopen(request) as response:
        html = response.read().decode("utf-8")
except urllib.error.URLError as e:
    # HTTPError (a URLError subclass) carries `code`; every URLError has `reason`.
    if hasattr(e, "code"):
        print(e.code)
    if hasattr(e, "reason"):
        print(e.reason)
# `html` is still "" here when the request failed.
print(html)

# Method 2: requests
import requests

# `url`, `header` (request headers) and `timeout` (seconds) must be defined
# before this snippet runs.
response = requests.get(url, headers=header, timeout=timeout)
response.encoding = "utf-8"  # decode the body as UTF-8
# Keep `html` a str (like method 1) so it can be passed straight to BeautifulSoup.
html = response.text

二、解析网页源代码寻找所需数据 BeautifulSoup

from bs4 import BeautifulSoup
import re 
bs = BeautifulSoup(html, "html.parser")

# First match only
bs.title
bs.a  # the first <a> tag, including everything between <a> and </a>
bs.a.string  # only the text inside the tag, without the surrounding markup
bs.a.attrs  # the tag's attributes as a dict

# Search for every match
# 1. Search by string
# NOTE: `list` shadows the builtin; the name is kept because the display
# snippet further down iterates over it.
list = bs.find_all("a")  # every <a> tag (exact tag-name match), returned as a list
list = bs.find_all("a", limit=3)  # stop after the first 3 <a> tags
list = bs.find_all(id="head")
list = bs.find_all("div", class_="item")  # <div> tags with class="item"

# 2. Search with a regular expression
list = bs.find_all(re.compile("a"))  # every tag whose name contains the letter "a"
# Raw string keeps `\d` a regex escape; `string=` is the current name of the old `text=` argument.
list = bs.find_all(string=re.compile(r"\d"))  # every text string that contains a digit

# 3. Search with a user-defined filter function
def name_is_exists(tag):
    """Return whether *tag* has a ``name`` attribute.

    Intended to be passed to ``find_all`` as a filter; the result is whatever
    ``tag.has_attr("name")`` returns.
    """
    attribute = "name"
    return tag.has_attr(attribute)
list = bs.find_all(name_is_exists)  # tags that carry a "name" attribute

# CSS selectors
list = bs.select("title")  # by tag name: <title>
list = bs.select(".mnav")  # by class name, e.g. <a class="mnav"></a>
list = bs.select("#u1")  # by id, e.g. <div id="u1">
list = bs.select("a[class='mnav']")  # by attribute value
list = bs.select("head>title")  # child combinator: <title> directly inside <head>

# Display the results
for item in list:
    print(item)
    # .get() yields None instead of raising KeyError when a tag has no href.
    print(item.get("href"))  # the hyperlink target
    print(item.get_text().strip())  # all text in the tag, leading/trailing whitespace removed

# Guard against an empty result so indexing cannot raise IndexError.
if list:
    print(list[0].get_text())

三、选择器示例
在这里插入图片描述
要拿到选中的href
看最下面一行的标签路径：#resultList → .el → .t1 → span → a

# Collect every <a> tag matched by the selector path
result = bs.select(".el>.t1>span>a")
# Pull the individual pieces out of each match
for link in result:
    print(link["href"])
    print(link["title"])
    print(link.get_text().strip())  # all text, leading/trailing whitespace removed

或（在 Scrapy 中使用 response.css 选择器）：

# `response` here is a Scrapy response object, not the urllib/requests one.
# extract() / extract_first() are methods and must be called; the attribute
# pseudo-element is written ::attr(name).
res = response.css("div.quote .text::text").extract()  # list of every matching text
res2 = response.css("div.quote a::attr(href)").extract_first()  # first matching href, or None

或者在 Chrome 开发者工具的 Elements 面板中按 Ctrl+F，输入 CSS 选择器或 XPath，检查路径是否能匹配到目标元素
在这里插入图片描述
四、xpath

from lxml import etree
# `url` must be defined before this snippet runs.
r = requests.get(url)
r.encoding = r.apparent_encoding  # use the encoding detected from the body
# Parse the page into an element tree
selector = etree.HTML(r.text)

content = selector.xpath('//div[@id="test"]/li/text()')  # text of <li> directly under <div id="test">
link = selector.xpath('//a/@href')  # href attribute of every <a>
title = selector.xpath('//a/@title')  # title attribute of every <a>