[Python Web Crawler] Reading notes on Web Scraping with Python (Chapter 3)

Web Scraping with Python

Chapter 3: Starting to Crawl

demo1

Traversing a single domain

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsobj=BeautifulSoup(html,'html.parser')
for link in bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])

print(type(bsobj.find('div',{'id':'bodyContent'})))  #<class 'bs4.element.Tag'>
print(type(bsobj.findAll('a',href=re.compile("^(/wiki/)((?!:).)*$")))) #<class 'bs4.element.ResultSet'>
print(type(bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$")))) #<class 'bs4.element.ResultSet'>
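
The regex "^(/wiki/)((?!:).)*$" used above keeps only article links: the href must start with /wiki/, and the negative lookahead (?!:) rejects anything containing a colon, which filters out pages such as /wiki/Category:... or /wiki/Talk:.... A quick way to see this (the sample hrefs below are made up purely for illustration):

import re

pattern = re.compile("^(/wiki/)((?!:).)*$")
# only plain article paths under /wiki/ that contain no colon should match
for href in ["/wiki/Kevin_Bacon", "/wiki/Category:Actors", "/wiki/Talk:Kevin_Bacon", "/w/index.php"]:
    print(href, bool(pattern.match(href)))

This prints True only for /wiki/Kevin_Bacon.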

demo2

Use a getLinks function to fetch the URLs

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def getLinks(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bsobj=BeautifulSoup(html,'html.parser')
    return bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__=='__main__':
    random.seed(datetime.datetime.now())
    links=getLinks("/wiki/Kevin_Bacon")
    while len(links)>0:
        newArticle=links[random.randint(0,len(links)-1)].attrs['href']
        print(newArticle)
        links=getLinks(newArticle)
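
As written, the walk keeps following random article links until it happens to reach a page with no matching links, so in practice it runs until you interrupt it. Below is a minimal sketch of the same loop with a hop limit, reusing the getLinks function above (maxHops is a name introduced here, not from the book):

def randomWalk(startUrl, maxHops=10):
    # follow random article links, but stop after maxHops pages
    links = getLinks(startUrl)
    hops = 0
    while len(links) > 0 and hops < maxHops:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
        hops += 1

randomWalk("/wiki/Kevin_Bacon")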

demo3

Save the collected URLs to a database (create the database yourself beforehand)

import mysql.connector  # import the MySQL connector module
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def insertdatabase(url):
    # connection settings
    config = {
        'host'     : '127.0.0.1',
        'user'     : 'root',
        'password' : '1234',
        'port'     : '3306',
        'database' : 'url_save',
        'charset'  : 'utf8'   # 'utf8', without the hyphen
    }
    database = mysql.connector.connect(**config)
    cur = database.cursor()    # get a cursor from the connection
    cur.execute("create table if not EXISTS url_table_"+url[2:5]+"(ID int PRIMARY KEY auto_increment,URL VARCHAR(200));")
    cur.execute("insert into url_table_"+url[2:5]+"(URL) VALUES('%s')"%url)    # without the single quotes around %s in VALUES, the URL cannot be written to the database
    print('Inserted successfully')
    database.commit()        # don't forget to commit
    cur.close()
    database.close()

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bsobj = BeautifulSoup(html,'html.parser')
    return bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__ == '__main__':
    random.seed(datetime.datetime.now())
    links = getLinks("/wiki/Kevin_Bacon")

    while len(links) > 0:
        newArticle = links[random.randint(0,len(links)-1)].attrs['href']
        print(newArticle)
        insertdatabase(newArticle)
        links = getLinks(newArticle)
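
The INSERT above builds the SQL string by hand, which is why the single quotes around %s matter. A safer variant is to let mysql.connector quote the value itself with a parameterized query; the following is a sketch using the same connection settings and table naming as insertdatabase (only the URL value can be parameterized, the table name still has to be concatenated):

import mysql.connector

def insertdatabase_safe(url):
    database = mysql.connector.connect(host='127.0.0.1', user='root', password='1234',
                                       port='3306', database='url_save', charset='utf8')
    cur = database.cursor()
    cur.execute("create table if not EXISTS url_table_"+url[2:5]+"(ID int PRIMARY KEY auto_increment,URL VARCHAR(200));")
    # the %s placeholder is escaped and quoted by the connector, so no manual quotes are needed
    cur.execute("insert into url_table_"+url[2:5]+"(URL) VALUES(%s)", (url,))
    database.commit()
    cur.close()
    database.close()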

demo4

Deduplicating links

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages=set()
def getLinks(pageUrl):
    global pages
    html=urlopen('http://en.wikipedia.org'+pageUrl)
    bsobj=BeautifulSoup(html,'html.parser')
    try:
        print(bsobj.h1.get_text())
        print(bsobj.find(id='mw-content-text').findAll('p')[0].get_text())
        print(bsobj.find(id='ca-edit').find("span").find('a').attrs['href'])
    except AttributeError:
        print("缺少一些属性")
    for link in bsobj.findAll('a',href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage=link.attrs['href']
                print('**************************************\n'+newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks("")

demo5

A combination of several functions that can selectively collect internal or external links; after running for a while, though, it started returning 403 errors.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages=set()
allExtLinks=set()
allIntLinks=set()
random.seed(datetime.datetime.now())

# get a list of all internal links on the page
def getInternalLinks(bsobj,includeUrl):
    internalLinks=[]
    # find all links that begin with a "/"
    for link in bsobj.findAll('a',href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    # relative link: prepend the site address so it can be opened later
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
# get a list of all external links on the page
def getExternalLinks(bsobj,excludeUrl):
    externalLinks=[]
    # find all links that start with "http" or "www" and do not contain the current URL
    for link in bsobj.findAll('a',href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def splitAddress(address):
    addressParts=address.replace("http://","").split("/")
    return addressParts
def getRandomExternalLink(startingPage):
    html=urlopen(startingPage)
    bsobj=BeautifulSoup(html,'html.parser')
    externalLinks=getExternalLinks(bsobj,splitAddress(startingPage)[0])
    if len(externalLinks)==0:
        internalLinks=getInternalLinks(bsobj,startingPage)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0,len(externalLinks)-1)]
def followExternalOnly(startingSite):
    externalLink=getRandomExternalLink(startingSite)
    print("随机外链是:"+externalLink+"\n")
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")

The rest of the chapter is about Scrapy; since it only supported Python 2.7 at the time, I'm setting that aside for now.
