Writing Spider.py

# -*- coding:utf-8 -*-



# The urllib library

from urllib.request import urlopen
# Import the urlopen function from the request module of the urllib library
html = urlopen("http://pythonscraping.com/pages/page1.html")
# Fetch the page
print(html.read())
# Read the response body and print it
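Note that read() returns raw bytes rather than a string. A minimal sketch of decoding the response (assuming the page is UTF-8 encoded) and letting a context manager close the connection:

from urllib.request import urlopen

# The with statement closes the connection automatically
with urlopen("http://pythonscraping.com/pages/page1.html") as html:
    # read() returns bytes; decode them to a str (assuming UTF-8)
    print(html.read().decode("utf-8"))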



# The BeautifulSoup library

from bs4 import BeautifulSoup
# Import the BeautifulSoup class from the bs4 library
from urllib.request import urlopen
# Import the urlopen function from the request module of the urllib library
html = urlopen("http://www.pythonscraping.com/pages/page1.html")
# Fetch the page
bsObj = BeautifulSoup(html.read(), "html.parser")
# Parse the response into a BeautifulSoup object (naming an explicit
# parser avoids a warning in newer versions of bs4)
print(bsObj.h1)
# Print the first h1 tag (html -> body -> h1)
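BeautifulSoup lets you reach the same tag through several equivalent paths; a short sketch using the bsObj parsed above:

# All of these return the same first h1 tag on the page
print(bsObj.html.body.h1)
print(bsObj.body.h1)
print(bsObj.html.h1)
print(bsObj.find("h1"))  # find() returns the first matching tag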



# Handling the exception when a page does not exist on the server

from urllib.error import HTTPError

try:
    html = urlopen("http://www.pythonscraping.com/pages/page1.html")
except HTTPError as e:
    print(e)
    # Return None, break out, or fall back to an alternative plan here
else:
    # The program continues. Note: if you already returned or broke out
    # in the except block above, this else clause is unnecessary and
    # would never run.
    pass

# Check whether the fetched page returned None

if html is None:
    print("URL is not found")
else:
    # The program continues
    pass
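Both checks can be folded into one small helper; a minimal sketch (safeGet is a hypothetical name, and URLError covers the case where the server itself cannot be reached):

from urllib.request import urlopen
from urllib.error import HTTPError, URLError

def safeGet(url):
    # Hypothetical helper: return the response, or None on any failure
    try:
        return urlopen(url)
    except HTTPError as e:
        # The server answered with an error status such as 404
        print(e)
        return None
    except URLError as e:
        # The server could not be reached at all
        print(e)
        return None

html = safeGet("http://www.pythonscraping.com/pages/page1.html")
if html is None:
    print("URL is not found")

HTTPError is caught before URLError because it is a subclass of URLError; the reverse order would swallow it.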

try:
    badContent = bsObj.nonExistingTag.anotherTag
except AttributeError as e:
    print("Tag was not found")
else:
    if badContent is None:
        print("Tag was not found")
    else:
        print(badContent)



from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup
def getTitle(url):  # Fetch a page and return its title, or None on failure
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        title = bsObj.body.h1
    except AttributeError as e:
        return None
    return title
title = getTitle("http://www.pythonscraping.com/pages/page1.html")
if title is None:
    print("Title could not be found")
else:
    print(title)
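The same pattern generalizes beyond titles; a minimal sketch (getTag is a hypothetical name, reusing the imports above):

def getTag(url, tagName):
    # Hypothetical generalization of getTitle: return the first tagName
    # tag inside body, or None if the page or the tag is missing
    try:
        html = urlopen(url)
    except HTTPError:
        return None
    try:
        bsObj = BeautifulSoup(html.read(), "html.parser")
        # If body is missing, the attribute access raises AttributeError
        return bsObj.body.find(tagName)
    except AttributeError:
        return None

print(getTag("http://www.pythonscraping.com/pages/page1.html", "h1"))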



# Traversing a single domain
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen("http://en.wikipedia.org/wiki/Kevin_Bacon")
bsObj = BeautifulSoup(html, "html.parser")
# Print the href attribute of every anchor tag on the page
for link in bsObj.findAll("a"):
    if 'href' in link.attrs:
        print(link.attrs['href'])
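Many of the printed hrefs are relative paths such as /wiki/...; a minimal sketch that resolves them to absolute URLs with urllib.parse.urljoin, reusing the bsObj above:

from urllib.parse import urljoin

base = "http://en.wikipedia.org/wiki/Kevin_Bacon"
for link in bsObj.findAll("a"):
    if 'href' in link.attrs:
        # urljoin resolves a relative href against the page URL
        print(urljoin(base, link.attrs['href']))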



from urllib.request import urlopen
from bs4 import BeautifulSoup
import datetime
import random
import re

random.seed(datetime.datetime.now().timestamp())
# Seed the random generator with the current time; timestamp() returns a
# float, which newer Python versions require (a datetime object is no
# longer an accepted seed type)
def getLinks(articleUrl):
    html = urlopen("http://en.wikipedia.org"+articleUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    # Keep only links in the article body that start with /wiki/ and
    # contain no colon (colons mark special pages such as File: or Talk:)
    return bsObj.find("div", {"id":"bodyContent"}).findAll("a",
                                                           href=re.compile("^(/wiki/)((?!:).)*$"))
links = getLinks("/wiki/Kevin_Bacon")
while len(links) > 0:
    newArticle = links[random.randint(0, len(links)-1)].attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
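As written, this random walk only ends when it lands on a page with no qualifying links. A minimal sketch that caps the walk (maxHops is an illustrative parameter, not part of the original):

maxHops = 10  # illustrative cap on the number of pages visited
links = getLinks("/wiki/Kevin_Bacon")
hops = 0
while len(links) > 0 and hops < maxHops:
    newArticle = random.choice(links).attrs["href"]
    print(newArticle)
    links = getLinks(newArticle)
    hops += 1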



# Link deduplication
from urllib.request import urlopen
from bs4 import BeautifulSoup
import re
pages = set()
def getLinks(pageUrl):
    global pages
    html = urlopen("http://en.wikipedia.org"+pageUrl)
    bsObj = BeautifulSoup(html, "html.parser")
    for link in bsObj.findAll("a", href=re.compile("^(/wiki/)")):
        if 'href' in link.attrs:
            if link.attrs['href'] not in pages:
                # We have encountered a new page
                newPage = link.attrs['href']
                print(newPage)
                pages.add(newPage)
                getLinks(newPage)
getLinks("")