根据请求网页返回的状态码，批量过滤无效网址

#coding=gbk
 
import os
import httplib2
import socket
import sys
 

def GetWebStatus(host):
    """Report whether requesting *host* yields an HTTP 200 response.

    Args:
        host: URL passed unchanged to ``httplib2.Http.request``.

    Returns:
        1 when the response's ``'status'`` entry equals the string ``'200'``;
        0 for any other status, or when the request raises.  On an
        exception the URL is also printed to stdout.
    """
    try:
        h = httplib2.Http(timeout=60)
        resp, _ = h.request(host)
        # The status is compared as a string, exactly as the response
        # mapping exposes it under the 'status' key.
        if resp.get('status') == '200':
            return 1
        return 0
    except Exception:
        # Deliberately broad: any failure while requesting the URL counts
        # as "invalid".  The single-argument parenthesized print is valid
        # on both Python 2 and Python 3.
        print(host)
        return 0
 
 
def _ExtractHost(line):
    """Return the host text carried by one already newline-stripped line.

    When the line contains ``UrlItem url=``, the value that follows the
    marker is returned, without its opening double quote and cut at the
    next double quote if there is one.  Lines without the marker are
    returned unchanged.
    """
    key = 'UrlItem url='
    pos = line.find(key)
    if pos == -1:
        return line

    # Locate the value relative to the marker instead of a fixed column, so
    # leading indentation before the tag does not corrupt the result.
    value = line[pos + len(key):]
    if value.startswith('"'):
        value = value[1:]

    # Only cut at a closing quote that actually exists; slicing with the -1
    # that find() returns would silently drop the last character.
    end = value.find('"')
    if end != -1:
        value = value[:end]
    return value


def ReadHost(xmlpath):
    """Check every host listed in *xmlpath* and sort it into a result file.

    Each line is reduced to a host via ``_ExtractHost``.  Lines whose
    result contains no ``'.'`` are skipped.  The host is prefixed with
    ``http://`` (or ``http://www.`` when it does not already contain
    ``www.``) and passed to ``GetWebStatus``.  Hosts for which that
    returns 0 are appended to ``c:/wuxiao.txt``; all others are appended
    to ``c:/valid.txt``, one host per line.

    Args:
        xmlpath: Path of the input file to read.
    """
    # The input is read in binary mode, as before.
    # NOTE(review): on Python 3 this yields bytes, which the str operations
    # below do not accept; confirm the target interpreter before porting.
    with open(xmlpath, 'rb') as obn:
        for raw in obn:
            line = _ExtractHost(raw.strip('\r\n'))

            if '.' not in line:
                continue

            if 'www.' in line:
                ss = 'http://' + line
            else:
                ss = 'http://www.' + line

            if GetWebStatus(ss) == 0:
                target = "c:/wuxiao.txt"
            else:
                target = "c:/valid.txt"

            # The result file is reopened for every host, so each verdict
            # is closed out to disk before the next request starts.
            with open(target, "a+") as out:
                out.write(line + '\n')

 
if __name__ == "__main__":
    # Script entry point: the first command-line argument is the input file
    # handed to ReadHost; without it, only a usage message is printed.
    if len(sys.argv) < 2:
        # Parenthesized single-argument print emits the same text on
        # Python 2 and is also valid syntax on Python 3.
        print('Error!pls enter the test file!If any question,pls contact coder! version:0.3')
    else:
        ReadHost(sys.argv[1])

 

 

转载于:https://www.cnblogs.com/xiaobaichuangtianxia/p/4299736.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值