最近为了更好地查询老王python的外链,准备写一个python urllib2查询脚本。一般查询外链比较准确的工具还是yahoo的外链工具,但有一点不方便:yahoo查出的外链要一页一页地翻,既费时又不方便统计。我想把网站的外链全部读取到文件里,这样方便在本地进行统计。
废话不说,上脚本吧。
#encoding=utf-8
#@author:老王python
#@description:检查网站的外链
import urllib
from sgmllib import SGMLParser
import re
import sys
infolist = {} #结果列表
class LinkParser( SGMLParser ):
'''抓取link列表的信息'''
def reset( self ):
SGMLParser . reset( self)
self . url = '' #链接文本
self . li_check = 0
self . a_check = 0
self . jt_url = '' #具体url
self . infolist = {}
def start_li( self , tag ):
'''检测li开头'''
if tag :
if tag [ 0 ][ 1 ] == 'sco5li0' or tag [ 0 ][ 1 ] == 'sco5li1' :
self . li_check = 1
def start_a( self , tag ):
'''检测a开头'''
if self . li_check == 1 :
if not tag [ 0 ][ 1 ] . startswith( 'http://203.209.253.250' ):
host_re = re . compile( r'^https?://(.*?)($|/)' ,
re . IGNORECASE
)
self . url = host_re . search( tag [ 0 ][ 1 ]) . group( 1)
self . jt_url = tag [ 0 ][ 1 ]
self . a_check = 1
def handle_data( self , text ):
'''处理空白文本'''
txt = text . strip()
if txt and self . a_check and txt != '快照' :
checkurl = ' %s , %s ' % ( self . url , self . jt_url)
self . infolist [ checkurl ] = txt
if txt == '' :
return
def end_li( self ):
self . li_check = 0
def end_a( self ):
self . a_check = 0
numre = re . compile( r'<strong>.+') #匹配总的记录数
pnum = re . compile( r'\d+')
checkurl = ''#查询网站的地址,比如http://www.xxx.com
checkurl = urllib . quote( checkurl) #请求地址
pageurl = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s ' % ( checkurl)
content = urllib . urlopen( pageurl) . read()
c = numre . search( content) . group( 0)
totalnum = int( pnum . search( c) . group( 0)) #总的外链数
host_re = re . compile( r'^http://(?P<host>www\.(?:[A-Z0-9-]+\.){1}[A-Z\.]{2,6})$' ,
re . IGNORECASE
)
pagesize = 50 #一页显示50条
if totalnum % pagesize :
page = totalnum / pagesize
else :
page = ( totalnum / pagesize) + 1
f = file( 'a.txt' , 'w')
for k in xrange( page ):
parser = LinkParser()
url = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s &b= %s ' % ( checkurl , k * 50)
print 'url=========>' , url
cstr = urllib . urlopen( url) . read()
parser . feed( cstr)
parser . close()
for m in parser . infolist :
domain , jt_url = m . split( ',')
print 'domain--------->' , domain
print 'jt_url--------->' , jt_url
t = 'url: %s ,jt_url: %s ,title: %s \n ' % ( domain , jt_url , parser . infolist [ m ] . decode( 'utf-8') . encode( 'utf-8'))
f . write( t)
f . close()
#@author:老王python
#@description:检查网站的外链
import urllib
from sgmllib import SGMLParser
import re
import sys
infolist = {} #结果列表
class LinkParser( SGMLParser ):
'''抓取link列表的信息'''
def reset( self ):
SGMLParser . reset( self)
self . url = '' #链接文本
self . li_check = 0
self . a_check = 0
self . jt_url = '' #具体url
self . infolist = {}
def start_li( self , tag ):
'''检测li开头'''
if tag :
if tag [ 0 ][ 1 ] == 'sco5li0' or tag [ 0 ][ 1 ] == 'sco5li1' :
self . li_check = 1
def start_a( self , tag ):
'''检测a开头'''
if self . li_check == 1 :
if not tag [ 0 ][ 1 ] . startswith( 'http://203.209.253.250' ):
host_re = re . compile( r'^https?://(.*?)($|/)' ,
re . IGNORECASE
)
self . url = host_re . search( tag [ 0 ][ 1 ]) . group( 1)
self . jt_url = tag [ 0 ][ 1 ]
self . a_check = 1
def handle_data( self , text ):
'''处理空白文本'''
txt = text . strip()
if txt and self . a_check and txt != '快照' :
checkurl = ' %s , %s ' % ( self . url , self . jt_url)
self . infolist [ checkurl ] = txt
if txt == '' :
return
def end_li( self ):
self . li_check = 0
def end_a( self ):
self . a_check = 0
numre = re . compile( r'<strong>.+') #匹配总的记录数
pnum = re . compile( r'\d+')
checkurl = ''#查询网站的地址,比如http://www.xxx.com
checkurl = urllib . quote( checkurl) #请求地址
pageurl = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s ' % ( checkurl)
content = urllib . urlopen( pageurl) . read()
c = numre . search( content) . group( 0)
totalnum = int( pnum . search( c) . group( 0)) #总的外链数
host_re = re . compile( r'^http://(?P<host>www\.(?:[A-Z0-9-]+\.){1}[A-Z\.]{2,6})$' ,
re . IGNORECASE
)
pagesize = 50 #一页显示50条
if totalnum % pagesize :
page = totalnum / pagesize
else :
page = ( totalnum / pagesize) + 1
f = file( 'a.txt' , 'w')
for k in xrange( page ):
parser = LinkParser()
url = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s &b= %s ' % ( checkurl , k * 50)
print 'url=========>' , url
cstr = urllib . urlopen( url) . read()
parser . feed( cstr)
parser . close()
for m in parser . infolist :
domain , jt_url = m . split( ',')
print 'domain--------->' , domain
print 'jt_url--------->' , jt_url
t = 'url: %s ,jt_url: %s ,title: %s \n ' % ( domain , jt_url , parser . infolist [ m ] . decode( 'utf-8') . encode( 'utf-8'))
f . write( t)
f . close()
原创文章请注明转载自老王python,本文地址:http://www.cnpythoner.com/post/121.html
作者:老王@python python 教程
老王python,提供python相关的python 书籍,python 主机,django 教程和python 下载,希望大家能够喜欢!