Notes on Using BeautifulSoup

1. Basic usage: fetch the h1 tag from a given URL

from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def gettitle(url):
    try:
        html = urlopen(url)
    except HTTPError:
        # The server returned an error page
        return None

    try:
        # Pass a parser explicitly to avoid the "no parser was specified" warning
        bsobj = BeautifulSoup(html.read(), 'html.parser')
        title = bsobj.body.h1
    except AttributeError:
        # Raised if the page has no <body> or no <h1>
        return None

    return title

title = gettitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print('not found')
else:
    print(title)
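
As a quick aside, the same tag can usually be reached in more than one way. A minimal sketch against the same page1.html, assuming (as above) that its only h1 sits inside the body:

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page1.html')
bsobj = BeautifulSoup(html.read(), 'html.parser')

print(bsobj.h1)          # first <h1> anywhere in the document
print(bsobj.body.h1)     # first <h1> inside <body>
print(bsobj.find('h1'))  # same tag, found via find()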

2. Parsing more complex HTML

  • Finding tags by their attributes, working with lists of tags, and navigating the parse tree
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup


def get_names(url):
    try:
        html = urlopen(url)
    except HTTPError:
        return None

    try:
        bsobj = BeautifulSoup(html.read(), 'html.parser')
        # findAll returns a (possibly empty) list, never None
        namelist = bsobj.findAll('span', {'class': 'green'})
        for name in namelist:
            print(name.get_text())  # get_text() strips the tags and keeps only the text
    except AttributeError:
        return None


get_names("http://www.pythonscraping.com/pages/warandpeace.html")

find(tag, attributes, recursive, text, keywords)
findAll(tag, attributes, recursive, text, limit, keywords)

tag: the tag name to match (div, h1, h2, and so on)
attributes: a dictionary of attributes to match
recursive: if set to False, only direct children are searched; the default is True, which searches all descendants against the filter
text: match on the tags' text content rather than their attributes; len() of the result gives the number of occurrences
limit: cap the number of results returned (findAll only)
keywords: match by keyword arguments; since class is a reserved word in Python, write obj.findAll(class_="green"), as in the sketch below
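
A minimal usage sketch of these parameters against the same warandpeace.html page (the printed counts depend on that page's content):

from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/warandpeace.html')
bsobj = BeautifulSoup(html.read(), 'html.parser')

# attributes: a dict value can list several acceptable values for one attribute
tags = bsobj.findAll('span', {'class': ['green', 'red']})
print(len(tags))

# keywords: class is reserved in Python, so use class_
greens = bsobj.findAll(class_='green')

# text: match on the text content itself
princes = bsobj.findAll(text='the prince')
print(len(princes))

# limit: return at most the first two matches
first_two = bsobj.findAll('span', {'class': 'green'}, limit=2)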

3. Crawling a single site: the site has many pages and some of them link to each other repeatedly, so every link already discovered is kept in one easy-to-query set, and only new links get crawled (this does not work on sites with anti-scraping measures, such as Zhihu or Baidu).

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages = set()

def get_links(pageurl):
    global pages
    html = urlopen('http://en.wikipedia.org' + pageurl)
    bsObj = BeautifulSoup(html.read(), 'html.parser')
    for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # A link we have not seen before: record it and recurse into it
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                get_links(newPage)

if __name__ == '__main__':
    get_links('')
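
One caveat: because get_links() calls itself for every new link, a large site like Wikipedia will quickly hit Python's default recursion limit (about 1000 frames). A minimal sketch of an iterative variant using an explicit queue and a page cap; get_links_iterative and max_pages are hypothetical names, not part of the original code:

from collections import deque
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

def get_links_iterative(starturl, max_pages=100):
    # Same idea as get_links(), but with an explicit queue instead of recursion,
    # plus a cap on discovered pages so the crawl terminates
    pages = set()
    queue = deque([starturl])
    while queue and len(pages) < max_pages:
        pageurl = queue.popleft()
        html = urlopen('http://en.wikipedia.org' + pageurl)
        bsObj = BeautifulSoup(html.read(), 'html.parser')
        for link in bsObj.findAll('a', href=re.compile('^(/wiki/)')):
            href = link.attrs.get('href')
            if href and href not in pages:
                print(href)
                pages.add(href)
                queue.append(href)

get_links_iterative('')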


4. Crawling across the internet: external links

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages = set()
random.seed(datetime.datetime.now().timestamp())

# Collect the list of all internal links found on a page
def getInternalLinks(bsObj, includeUrl):
    internalLinks = []
    # Internal links start with "/" or contain the site's own domain
    for link in bsObj.findAll('a', href=re.compile('^(/|.*' + includeUrl + ')')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                internalLinks.append(link.attrs['href'])
    return internalLinks

# Collect the list of all external links found on a page
def getExternalLinks(bsObj, excludeUrl):
    externalLinks = []
    # External links start with http/https/www and do not contain the current domain
    for link in bsObj.findAll('a', href=re.compile('^(http|www|https)((?!' + excludeUrl + ').)*$')):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks

def splitAddress(address):
    # Strip the scheme and split on "/"; the first part is the domain
    addressParts = address.replace('http://', '').split('/')
    return addressParts

def getRandomExternalLink(startingPage):
    html = urlopen(startingPage)
    bsObj = BeautifulSoup(html.read(), 'html.parser')
    externalLinks = getExternalLinks(bsObj, splitAddress(startingPage)[0])
    if len(externalLinks) == 0:
        # No external links on this page; a fuller version would follow a random
        # internal link (via getInternalLinks) and keep looking
        return None
    else:
        return externalLinks[random.randint(0, len(externalLinks) - 1)]

def followExternalOnly(startingSite):
    externalLink = getRandomExternalLink(startingSite)
    if externalLink is None:
        return
    print('Random external link: ' + externalLink)
    followExternalOnly(externalLink)

followExternalOnly('http://jianshu.com')
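
Since followExternalOnly() keeps recursing until it reaches a page with no external links, the crawl can run indefinitely. A minimal sketch of a depth-limited variant, reusing the functions above; followExternalOnlyLimited and max_hops are hypothetical names, not part of the original code:

def followExternalOnlyLimited(startingSite, max_hops=10):
    # Stop after max_hops hops, or earlier if a page has no external links
    site = startingSite
    for _ in range(max_hops):
        externalLink = getRandomExternalLink(site)
        if externalLink is None:
            break
        print('Random external link: ' + externalLink)
        site = externalLink

followExternalOnlyLimited('http://jianshu.com', max_hops=5)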


Python QQ group: 298948196
