[Python Web Crawler] Reading notes on Web Scraping with Python (Chapter 3)

Web Scraping with Python

Chapter 3: Starting to Crawl

demo1

Traversing a single domain

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
html=urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsobj=BeautifulSoup(html,'html.parser')
for link in bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$")):
    if 'href' in link.attrs:
        print(link.attrs['href'])

print(type(bsobj.find('div',{'id':'bodyContent'})))  #<class 'bs4.element.Tag'>
print(type(bsobj.findAll('a',href=re.compile("^(/wiki/)((?!:).)*$")))) #<class 'bs4.element.ResultSet'>
print(type(bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$")))) #<class 'bs4.element.ResultSet'>
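
The regex "^(/wiki/)((?!:).)*$" used above keeps only article links: the href must start with /wiki/, and the negative lookahead (?!:) rejects anything containing a colon, which filters out pages such as /wiki/Category:... or /wiki/Talk:.... A quick way to see this (the sample hrefs below are made up purely for illustration):

import re

pattern = re.compile("^(/wiki/)((?!:).)*$")
# only plain article paths under /wiki/ that contain no colon should match
for href in ["/wiki/Kevin_Bacon", "/wiki/Category:Actors", "/wiki/Talk:Kevin_Bacon", "/w/index.php"]:
    print(href, bool(pattern.match(href)))

This prints True only for /wiki/Kevin_Bacon.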

demo2

Use a getLinks function to fetch the URLs

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def getLinks(articleUrl):
    html=urlopen("http://en.wikipedia.org"+articleUrl)
    bsobj=BeautifulSoup(html,'html.parser')
    return bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__=='__main__':
    random.seed(datetime.datetime.now())
    links=getLinks("/wiki/Kevin_Bacon")
    while len(links)>0:
        newArticle=links[random.randint(0,len(links)-1)].attrs['href']
        print(newArticle)
        links=getLinks(newArticle)
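
As written, the walk keeps following random article links until it happens to reach a page with no matching links, so in practice it runs until you interrupt it. Below is a minimal sketch of the same loop with a hop limit, reusing the getLinks function above (maxHops is a name introduced here, not from the book):

def randomWalk(startUrl, maxHops=10):
    # follow random article links, but stop after maxHops pages
    links = getLinks(startUrl)
    hops = 0
    while len(links) > 0 and hops < maxHops:
        newArticle = links[random.randint(0, len(links)-1)].attrs['href']
        print(newArticle)
        links = getLinks(newArticle)
        hops += 1

randomWalk("/wiki/Kevin_Bacon")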

demo3

Save the collected URLs to a database (create the database yourself beforehand)

import mysql.connector  # import the MySQL connector module
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

def insertdatabase(url):
    # connection settings
    config = {
        'host'     : '127.0.0.1',
        'user'     : 'root',
        'password' : '1234',
        'port'     : '3306',
        'database' : 'url_save',
        'charset'  : 'utf8'   # 'utf8', without the hyphen
    }
    database = mysql.connector.connect(**config)
    cur = database.cursor()    # get a cursor from the connection
    cur.execute("create table if not EXISTS url_table_"+url[2:5]+"(ID int PRIMARY KEY auto_increment,URL VARCHAR(200));")
    cur.execute("insert into url_table_"+url[2:5]+"(URL) VALUES('%s')"%url)    # without the single quotes around %s in VALUES, the URL cannot be written to the database
    print('Inserted successfully')
    database.commit()        # don't forget to commit
    cur.close()
    database.close()

def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bsobj = BeautifulSoup(html,'html.parser')
    return bsobj.find('div',{'id':'bodyContent'}).findAll('a',href=re.compile("^(/wiki/)((?!:).)*$"))

if __name__ == '__main__':
    random.seed(datetime.datetime.now())
    links = getLinks("/wiki/Kevin_Bacon")

    while len(links) > 0:
        newArticle = links[random.randint(0,len(links)-1)].attrs['href']
        print(newArticle)
        insertdatabase(newArticle)
        links = getLinks(newArticle)
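
The INSERT above builds the SQL string by hand, which is why the single quotes around %s matter. A safer variant is to let mysql.connector quote the value itself with a parameterized query; the following is a sketch using the same connection settings and table naming as insertdatabase (only the URL value can be parameterized, the table name still has to be concatenated):

import mysql.connector

def insertdatabase_safe(url):
    database = mysql.connector.connect(host='127.0.0.1', user='root', password='1234',
                                       port='3306', database='url_save', charset='utf8')
    cur = database.cursor()
    cur.execute("create table if not EXISTS url_table_"+url[2:5]+"(ID int PRIMARY KEY auto_increment,URL VARCHAR(200));")
    # the %s placeholder is escaped and quoted by the connector, so no manual quotes are needed
    cur.execute("insert into url_table_"+url[2:5]+"(URL) VALUES(%s)", (url,))
    database.commit()
    cur.close()
    database.close()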

demo4

Deduplicating links

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re

pages=set()
def getLinks(pageUrl):
    global pages
    html=urlopen('http://en.wikipedia.org'+pageUrl)
    bsobj=BeautifulSoup(html,'html.parser')
    try:
        print(bsobj.h1.get_text())
        print(bsobj.find(id='mw-content-text').findAll('p')[0].get_text())
        print(bsobj.find(id='ca-edit').find("span").find('a').attrs['href'])
    except AttributeError:
        print("缺少一些属性")
    for link in bsobj.findAll('a',href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                newPage=link.attrs['href']
                print('**************************************\n'+newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks("")

demo5

A combination of several functions that can selectively collect internal or external links; after running for a while, though, it started returning 403 errors.

from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
import datetime
import random

pages=set()
allExtLinks=set()
allIntLinks=set()
random.seed(datetime.datetime.now())

# get a list of all internal links on the page
def getInternalLinks(bsobj,includeUrl):
    internalLinks=[]
    # find all links that begin with a "/"
    for link in bsobj.findAll('a',href=re.compile("^(/|.*"+includeUrl+")")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in internalLinks:
                if link.attrs['href'].startswith("/"):
                    # relative link: prepend the site address so it can be opened later
                    internalLinks.append(includeUrl+link.attrs['href'])
                else:
                    internalLinks.append(link.attrs['href'])
    return internalLinks
# get a list of all external links on the page
def getExternalLinks(bsobj,excludeUrl):
    externalLinks=[]
    # find all links that start with "http" or "www" and do not contain the current URL
    for link in bsobj.findAll('a',href=re.compile("^(http|www)((?!"+excludeUrl+").)*$")):
        if link.attrs['href'] is not None:
            if link.attrs['href'] not in externalLinks:
                externalLinks.append(link.attrs['href'])
    return externalLinks
def splitAddress(address):
    addressParts=address.replace("http://","").split("/")
    return addressParts
def getRandomExternalLink(startingPage):
    html=urlopen(startingPage)
    bsobj=BeautifulSoup(html,'html.parser')
    externalLinks=getExternalLinks(bsobj,splitAddress(startingPage)[0])
    if len(externalLinks)==0:
        internalLinks=getInternalLinks(bsobj,startingPage)
        return getRandomExternalLink(internalLinks[random.randint(0,len(internalLinks)-1)])
    else:
        return externalLinks[random.randint(0,len(externalLinks)-1)]
def followExternalOnly(startingSite):
    externalLink=getRandomExternalLink(startingSite)
    print("随机外链是:"+externalLink+"\n")
    followExternalOnly(externalLink)

followExternalOnly("http://oreilly.com")

The rest of the chapter is about Scrapy; since it only supported Python 2.7 at the time, I'm setting that aside for now.
