Writing a Simple Web Crawler in Python (Example 1)

by:授客

QQ:1033553122

Environment

Python version: 3.3.5 (the script errors out under 2.7)
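The 2.7 failures are import errors: urllib.request and html.parser are Python 3 module names, and Python 3's input() replaces raw_input(). For orientation only, the two renamed imports map roughly like this:

# Python 3 (this script)                  # Python 2.7 equivalent
from urllib.request import Request        # from urllib2 import Request
from html.parser import HTMLParser        # from HTMLParser import HTMLParser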

Objective

Learning and research: extracting specific URLs from a website.
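Concretely, the specific URLs targeted here are "treasure" links of the form treasure.php?trenum=NNNNN, which the script matches with re.findall. A quick standalone check of the same pattern (the sample URL is invented for illustration):

import re

pattern = "http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9][0-9][0-9][0-9][0-9]"
sample = "http://bbs.51testing.com/treasure/treasure.php?trenum=12345"  # hypothetical URL
print(re.findall(pattern, sample))
# ['http://bbs.51testing.com/treasure/treasure.php?trenum=12345']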

Basic approach:

1) Start from a given initial URL (the entry URL).

2) Download the initial page, parse it to extract the specific URLs it contains, and save any data already analyzed.

3) Depending on the situation, decide whether the extracted URLs need further filtering and screening.

4) Loop: treat each URL that survives filtering as a new entry URL and crawl again.

At each round, already-crawled URLs must be checked for, to avoid fetching the same page twice; this means keeping a record of crawled URLs for later lookups. A minimal sketch of this loop is given below.
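Put together, steps 1 through 4 amount to a level-by-level loop over a frontier of URLs. Here fetch_page and extract_urls are placeholders for the download and parsing steps that the full script implements further down:

def crawl(entry_url, fetch_page, extract_urls, max_level=2):
    visited = set()         # record of already-crawled URLs (the check above)
    frontier = {entry_url}  # URLs to crawl on the current level
    for _ in range(max_level):
        next_frontier = set()
        for url in frontier:
            if url in visited:
                continue                       # skip already-crawled URLs
            visited.add(url)
            page = fetch_page(url)             # step 2: download
            found = extract_urls(page)         # step 2: extract URLs
            next_frontier |= found - visited   # steps 3-4: filter, re-seed
        frontier = next_frontier
    return visited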

Python script

#!/usr/bin/env python
# -*- coding:utf-8 -*-

from urllib.request import Request, urlopen
import gzip, re
from io import BytesIO
from html.parser import HTMLParser

# Crawler class
class Reptile:
    """Download web pages and collect target URLs."""

    def __init__(self, filepath="d:/url.txt"):
        self.url_set = set()      # URLs whose pages have already been downloaded
        self.filepath = filepath  # path of the file that stores the target URLs
        self.data = ""

    def get_page(self, url, headers):
        request = Request(url, headers=headers)
        # Request a gzip-compressed page to reduce network traffic
        request.add_header('Accept-encoding', 'gzip')
        try:
            response = urlopen(request)  # send the request
            if response.code == 200:     # request succeeded
                page = response.read()   # read the (possibly compressed) page
                if response.info().get("Content-Encoding") == "gzip":
                    page_data = BytesIO(page)
                    gzipper = gzip.GzipFile(fileobj=page_data)
                    self.data = gzipper.read()
                else:
                    print("gzip unused")
                    self.data = page  # page was not gzip-compressed; use it as-is
        except Exception:
            pass
        self.url_set.add(url)
        # Save target URLs as soon as they are seen
        with open(self.filepath, "a") as f:
            flag = re.findall(
                "http://bbs.51testing.com/treasure/treasure.php[?]trenum=[0-9][0-9][0-9][0-9][0-9]",
                url)
            if flag:
                f.write(url)
                f.write("\n")
        return self.data

    # Collect seed URLs
    def get_url_seed(self, url_set, include, exclusive):
        url_seed_set = set()  # URLs that live on the same server
        seed_set = set()      # URLs that survive the final filtering
        # Drop URLs that do not belong to the current server
        while len(url_set) != 0:
            url = url_set.pop()
            if re.findall(include, url):
                url_seed_set.add(url)
        # Optionally filter out unwanted URLs as well
        if exclusive != "":
            while len(url_seed_set) != 0:
                url = url_seed_set.pop()
                if re.findall(exclusive, url) == []:
                    seed_set.add(url)
            return seed_set
        else:
            return url_seed_set

    # Filter seed URLs (drop the ones that were already crawled)
    def filter_seed_url(self, url_set):
        result_set = url_set - self.url_set
        return result_set

# Parser class
class MyHtmlParser(HTMLParser):
    def reset(self):
        HTMLParser.reset(self)  # note the order: reset the base class first
        self.url_set = set()

    def handle_starttag(self, tag, attrs):
        url_list = [value for key, value in attrs if key == "href"]
        for url in url_list:
            self.url_set.add(url)

############## Test ################

# Add headers to pose as a browser; some sites refuse obvious crawlers
headers = {'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:33.0) Gecko/20100101 Firefox/33.0"}
init_url = "http://bbs.51testing.com/"

# Build the parser (strict is valid on Python 3.3; see the note after the script)
parser = MyHtmlParser(strict=False)

# Download the entry page
page_number = 1
print("program is downloading the first url page")
reptile = Reptile()
page = reptile.get_page(init_url, headers)
print("processing the %dth url page" % page_number)

# Parse the page (collect its URLs)
parser.feed(str(page))

# Collect the seed URLs
homepage = "http://bbs.51testing.com/"
exclusion = ("mod=login|card.php|archiver|mod=viewthread|[.]css|[.]js|[.]gif"
             "|[.]jpg|about[.]php|panel[.]php|[.]swf|search[.]php")
exclusive = re.compile(exclusion)
include = re.compile(homepage)
url_seed_set = reptile.get_url_seed(parser.url_set, include, exclusive)

# Filter the seed URLs
result_set = reptile.filter_seed_url(url_seed_set)
print("complete")

# Main loop
if_continue = "yes"
while if_continue in ("yes", "", "YES", "Yes"):
    if_continue = input("to grab another round, type 'yes' or press Enter; "
                        "anything else exits: ")
    if if_continue not in ("yes", "", "Yes", "YES"):
        break
    deep = input("the level you want the reptile to try: ")
    if not deep.isdigit():
        print("value must be a number, try again")
        continue
    level = int(deep)
    i = 0
    total_set = set()
    while i < level:
        print("***************** parsing url pages on the %dth level *****************" % (i + 1))
        for url in result_set:
            print("program is processing the %dth url" % (page_number + 1))
            page = reptile.get_page(url, headers)
            parser.feed(str(page))
            url_seed_set = reptile.get_url_seed(parser.url_set, include, exclusive)
            result_set_tmp = reptile.filter_seed_url(url_seed_set)
            total_set = total_set | result_set_tmp  # accumulate newly found URLs
            page_number = page_number + 1
        result_set = total_set
        if result_set:
            i = i + 1
        else:
            break

print("complete")

Result:

(screenshot of the collected URLs omitted; the image did not survive extraction)

Disclaimer: for learning and research only; do not use for any illegal purpose.
