最近为了更好地查询老王python的外链,准备写一个python urllib2查询脚本。一般查询外链比较准确的工具还是yahoo的外链工具,但有一点不方便:yahoo查出的外链要一页一页地翻,既费时又不方便统计。我想把网站的外链全部读取到文件里,这样方便在本地进行统计。
废话不说,上脚本吧。
#encoding=utf-8
#@author:老王python
#@description:检查网站的外链
import urllib
from sgmllib import SGMLParser
import re
import sys
infolist = {} #结果列表
class LinkParser( SGMLParser ):
'''抓取link列表的信息'''
def reset( self ):
SGMLParser . reset( self)
self . url = '' #链接文本
self . li_check = 0
self . a_check = 0
self . jt_url = '' #具体url
self . infolist = {}
def start_li( self , tag ):
'''检测li开头'''
if tag :
if tag [ 0 ][ 1 ] == 'sco5li0' or tag [ 0 ][ 1 ] == 'sco5li1' :
self . li_check = 1
def start_a( self , tag ):
'''检测a开头'''
if self . li_check == 1 :
if not tag [ 0 ][ 1 ] . startswith( 'http://203.209.253.250' ):
host_re = re . compile( r'^https?://(.*?)($|/)' ,
re . IGNORECASE
)
self . url = host_re . search( tag [ 0 ][ 1 ]) . group( 1)
self . jt_url = tag [ 0 ][ 1 ]
self . a_check = 1
def handle_data( self , text ):
'''处理空白文本'''
txt = text . strip()
if txt and self . a_check and txt != '快照' :
checkurl = ' %s , %s ' % ( self . url , self . jt_url)
self . infolist [ checkurl ] = txt
if txt == '' :
return
def end_li( self ):
self . li_check = 0
def end_a( self ):
self . a_check = 0
numre = re . compile( r'<strong>.+') #匹配总的记录数
pnum = re . compile( r'\d+')
checkurl = ''#查询网站的地址,比如http://www.xxx.com
checkurl = urllib . quote( checkurl) #请求地址
pageurl = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s ' % ( checkurl)
content = urllib . urlopen( pageurl) . read()
c = numre . search( content) . group( 0)
totalnum = int( pnum . search( c) . group( 0)) #总的外链数
host_re = re . compile( r'^http://(?P<host>www\.(?:[A-Z0-9-]+\.){1}[A-Z\.]{2,6})$' ,
re . IGNORECASE
)
pagesize = 50 #一页显示50条
if totalnum % pagesize :
page = totalnum / pagesize
else :
page = ( totalnum / pagesize) + 1
f = file( 'a.txt' , 'w')
for k in xrange( page ):
parser = LinkParser()
url = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s &b= %s ' % ( checkurl , k * 50)
print 'url=========>' , url
cstr = urllib . urlopen( url) . read()
parser . feed( cstr)
parser . close()
for m in parser . infolist :
domain , jt_url = m . split( ',')
print 'domain--------->' , domain
print 'jt_url--------->' , jt_url
t = 'url: %s ,jt_url: %s ,title: %s \n ' % ( domain , jt_url , parser . infolist [ m ] . decode( 'utf-8') . encode( 'utf-8'))
f . write( t)
f . close()
#@author:老王python
#@description:检查网站的外链
import urllib
from sgmllib import SGMLParser
import re
import sys
infolist = {} #结果列表
class LinkParser( SGMLParser ):
'''抓取link列表的信息'''
def reset( self ):
SGMLParser . reset( self)
self . url = '' #链接文本
self . li_check = 0
self . a_check = 0
self . jt_url = '' #具体url
self . infolist = {}
def start_li( self , tag ):
'''检测li开头'''
if tag :
if tag [ 0 ][ 1 ] == 'sco5li0' or tag [ 0 ][ 1 ] == 'sco5li1' :
self . li_check = 1
def start_a( self , tag ):
'''检测a开头'''
if self . li_check == 1 :
if not tag [ 0 ][ 1 ] . startswith( 'http://203.209.253.250' ):
host_re = re . compile( r'^https?://(.*?)($|/)' ,
re . IGNORECASE
)
self . url = host_re . search( tag [ 0 ][ 1 ]) . group( 1)
self . jt_url = tag [ 0 ][ 1 ]
self . a_check = 1
def handle_data( self , text ):
'''处理空白文本'''
txt = text . strip()
if txt and self . a_check and txt != '快照' :
checkurl = ' %s , %s ' % ( self . url , self . jt_url)
self . infolist [ checkurl ] = txt
if txt == '' :
return
def end_li( self ):
self . li_check = 0
def end_a( self ):
self . a_check = 0
numre = re . compile( r'<strong>.+') #匹配总的记录数
pnum = re . compile( r'\d+')
checkurl = ''#查询网站的地址,比如http://www.xxx.com
checkurl = urllib . quote( checkurl) #请求地址
pageurl = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s ' % ( checkurl)
content = urllib . urlopen( pageurl) . read()
c = numre . search( content) . group( 0)
totalnum = int( pnum . search( c) . group( 0)) #总的外链数
host_re = re . compile( r'^http://(?P<host>www\.(?:[A-Z0-9-]+\.){1}[A-Z\.]{2,6})$' ,
re . IGNORECASE
)
pagesize = 50 #一页显示50条
if totalnum % pagesize :
page = totalnum / pagesize
else :
page = ( totalnum / pagesize) + 1
f = file( 'a.txt' , 'w')
for k in xrange( page ):
parser = LinkParser()
url = 'http://sitemap.cn.yahoo.com/search?bwm=i&bwmo=d&p= %s &b= %s ' % ( checkurl , k * 50)
print 'url=========>' , url
cstr = urllib . urlopen( url) . read()
parser . feed( cstr)
parser . close()
for m in parser . infolist :
domain , jt_url = m . split( ',')
print 'domain--------->' , domain
print 'jt_url--------->' , jt_url
t = 'url: %s ,jt_url: %s ,title: %s \n ' % ( domain , jt_url , parser . infolist [ m ] . decode( 'utf-8') . encode( 'utf-8'))
f . write( t)
f . close()
原创文章请注明转载自老王python,本文地址:http://www.cnpythoner.com/post/121.html
作者:老王@python python 教程
老王python,提供python相关的python 书籍,python 主机,django 教程和python 下载,希望大家能够喜欢!