Crawling links from a web page with Python: fetch a specified page and every link it contains

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# ****************************************************************************
# Copyright (C) 2010 yangyingchao@gmail.com
# Author: yangyingchao
#
# This program is free software; you can redistribute it and/or modify it
# under the terms of the GNU General Public License as published by the Free
# Software Foundation; either version 2, or (at your option) any later
# version.
#
# This program is distributed in the hope that it will be useful, but WITHOUT
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# GNU Emacs; see the file COPYING. If not, write to the Free Software
# Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
# ****************************************************************************

from copy import deepcopy
from sgmllib import SGMLParser
from xml.dom.minidom import *
import os
import re
import sys
import urllib2

title = "Untitled"

class MyParser(SGMLParser):
    def __init__(self):
        # Initialize title/link up front so a page without a <title> or <a>
        # tag cannot trigger an AttributeError later on.
        self.title = "Untitled"
        self.link = ""
        self.data = ""
        self.links = []
        self.TAG_BEG = False
        self.TAG_END = False
        SGMLParser.__init__(self, 0)

    def handle_data(self, data):
        if (self.TAG_BEG is True) and (self.TAG_END is False):
            self.data += data

    def start_title(self, attrs):
        self.link = ""
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_title(self):
        self.TAG_BEG = False
        self.TAG_END = True
        self.title = self.data.strip()

    def flush(self):
        pass

    def handle_comment(self, data):
        pass

    def start_a(self, attrs):
        self.data = ""
        self.TAG_BEG = True
        self.TAG_END = False
        for (key, val) in attrs:
            if key == "href":
                self.link = val

    def end_a(self):
        self.TAG_BEG = False
        self.TAG_END = True
        tmp = {}
        tmp["name"] = self.data
        tmp["link"] = self.link
        self.links.append(deepcopy(tmp))

    def unknown_starttag(self, tag, attrs):
        pass

    def unknown_endtag(self, tag):
        pass

    def unknown_entityref(self, ref):
        pass

    def unknown_charref(self, ref):
        pass

    def unknown_decl(self, data):
        pass

    def close(self):
        SGMLParser.close(self)
        self.flush()
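# Example (illustration only, not part of the crawl flow): MyParser can be
# driven by hand like this; reptile() below is the real entry point.
#
#   parser = MyParser()
#   parser.feed('<html><head><title>Demo</title></head>'
#               '<body><a href="a.html">A</a></body></html>')
#   parser.close()
#   print parser.title    # -> "Demo"
#   print parser.links    # -> [{'name': 'A', 'link': 'a.html'}]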

def lst2str(lst):
    string = ""
    for item in lst:
        string += item.strip() + "\n"
    return string

def downURL(url, filename):
    print "Download %s, save as %s" % (url, filename)
    try:
        fp = urllib2.urlopen(url)
    except:
        print "download exception"
        print sys.exc_info()
        return 0
    op = open(filename, "wb")
    while 1:
        s = fp.read()
        if not s:
            break
        op.write(s)
    fp.close()
    op.close()
    return 1

def reptile(base_url):
    """
    Download all articles from base_url.

    Arguments:
    - `base_url`: URL of the website.
    """
    page_list = []
    if not len(base_url):
        print "No page to reptile!"
        sys.exit(1)

    parser = MyParser()

    if base_url.startswith("http"):
        myopen = urllib2.urlopen
    else:
        myopen = open
    try:
        content = myopen(base_url).read()
    except:
        print "Failed to read from %s." % base_url
        print sys.exc_info()
        return

    parser.feed(content)

    for tmp in parser.links:
        page_list.append(tmp.get("link"))

    global title
    title = parser.title
    parser.close()

    item_list = list(set(page_list))
    for item in item_list:
        # Strip the '#' anchor part from the url.
        pos = item.find('#')
        if pos != -1:
            item = item[:pos]

        # Prepend base_url to item if it is a relative link.
        if not item.startswith("http"):
            item = base_url.rstrip("/") + "/" + item

        local_file = item.split("/")[-1]
        print item, local_file
        if not local_file:
            print "Empty local file! Continue from next one!"
            continue
        if os.access(local_file, os.F_OK):
            print "File: %s existed, skip ..." % local_file
        else:
            ret = downURL(item, local_file)

    # Remember to download the index file!
    downURL(base_url, "index.html")

    print "Total: %d articles." % (len(item_list))

def walk_dir(lst, dirname, filenames):
    for filename in filenames:
        fn = os.path.join(dirname, filename)
        if os.path.isdir(fn) or not filename.endswith("html"):
            continue
        print "Processing: %s" % fn
        tmp = {}
        parser = MyParser()
        content = open(fn).read()
        parser.feed(content)
        tmp["file"] = filename
        tmp["title"] = parser.title
        parser.close()
        lst.append(deepcopy(tmp))

def gen_index():
    """
    Generate an index of all html files in this directory.
    """
    file_lists = []
    os.path.walk(".", walk_dir, file_lists)
    fp = open("%s.devhelp2" % os.path.basename(os.getcwd()), "w")
    string = '<?xml version="1.0" encoding="utf-8"?>\n' + \
             '<book author="" language="c" link="index.html" name="" title="%s"' % title + \
             ' version="2" xmlns="http://www.devhelp.net/book">\n<chapters>\n'
    for item in file_lists:
        link = item.get("file")
        try:
            # Titles on hi.baidu.com pages are GBK encoded; re-encode them to
            # utf-8 so the index matches its declared encoding.
            name = item.get("title").decode('gbk').encode('utf-8')
        except:
            name = item.get("title")
        finally:
            string += '<sub link="%s" name="%s"/>\n' % (link, name)
    string += '</chapters>\n</book>\n'
    fp.write(string)
    fp.close()
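# For reference, the generated "<dirname>.devhelp2" index file ends up looking
# roughly like the following (made-up entries; structure follows the devhelp2
# book format):
#
#   <?xml version="1.0" encoding="utf-8"?>
#   <book author="" language="c" link="index.html" name="" title="Some blog"
#         version="2" xmlns="http://www.devhelp.net/book">
#   <chapters>
#   <sub link="post1.html" name="First article"/>
#   <sub link="post2.html" name="Second article"/>
#   </chapters>
#   </book>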

if __name__ == '__main__':
    if len(sys.argv) != 2:
        print "Usage: %s <url of baidu space>" % sys.argv[0]
        print "Such as: %s http://hi.baidu.com/Username" % sys.argv[0]
        gen_index()
        sys.exit(1)
    base_url = sys.argv[1]
    reptile(base_url)
    gen_index()
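The script above is Python 2 only: sgmllib and urllib2 were removed in Python 3. As a rough sketch of the same idea for Python 3 readers, the link-collecting core could be rewritten with html.parser and urllib.request along these lines (class and function names here are placeholders, and this version only lists the links rather than downloading every article):

#!/usr/bin/env python3
# Minimal Python 3 sketch of the link collection above (assumed port, not
# part of the original script).
from html.parser import HTMLParser
from urllib.parse import urljoin
from urllib.request import urlopen
import sys

class LinkParser(HTMLParser):
    """Collect the href of every <a> tag, like MyParser.start_a/end_a."""
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for key, val in attrs:
                if key == "href" and val:
                    self.links.append(val)

def fetch_links(base_url):
    content = urlopen(base_url).read().decode("utf-8", errors="replace")
    parser = LinkParser()
    parser.feed(content)
    # Resolve relative links against the page URL, roughly what reptile()
    # does by prepending base_url.
    return sorted(set(urljoin(base_url, link) for link in parser.links))

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: %s <url>" % sys.argv[0])
        sys.exit(1)
    for link in fetch_links(sys.argv[1]):
        print(link)

Invocation mirrors the original script: pass a single URL argument, e.g. python3 fetch_links.py http://example.com, and it prints the de-duplicated absolute links found on that page.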
