文本文件中获取http链接

#!/usr/bin/python2.7
#coding=utf-8


import sys
import MySQLdb
import time
import hashlib
import datetime
import json
import urllib2
import httplib,urllib
import zim

# Python 2 only: re-import sys so that sys.setdefaultencoding is reachable,
# switch the interpreter-wide default string encoding to UTF-8, then delete
# the setter again so nothing later in the process can change it.
# The three statements depend on this exact order.
reload(sys)
sys.setdefaultencoding('utf-8')
del sys.setdefaultencoding
# Print the local wall-clock time at which this run started.
print time.strftime('%Y-%m-%d %H:%M:%S',time.localtime(time.time()))

# Characters that may appear in a link: every ASCII letter (each lowercase
# letter immediately followed by its uppercase form), the ten digits, and
# the three punctuation marks '/', '.' and ':'.
listtags = []
for code in range(ord('a'), ord('z') + 1):
    letter = chr(code)
    listtags.append(letter)
    listtags.append(letter.upper())

for digit in range(10):
    listtags.append(str(digit))

listtags.extend(['/', '.', ':'])


def visit_oapi(purl):
    """Open *purl* with urllib2 and read the whole response body.

    The body is discarded; the function returns None. Any exception raised
    by ``urllib2.urlopen`` or by reading the response propagates to the
    caller.
    """
    response = urllib2.urlopen(purl)
    try:
        # Drain the body so the request is fully completed.
        response.read()
    finally:
        # The original never closed the response and leaked the socket.
        response.close()

# Load the passports to process, then release the database resources before
# the (slow) HTTP loop starts.
# NOTE(review): the database password and the API signing key below are
# hard-coded in source; consider moving them to configuration.
conn = MySQLdb.connect(host='10.13.81.11', user='portal',passwd='portal@sohu', db='smc_user', port=3306, charset='utf8')
try:
    cur = conn.cursor()
    try:
        sql = "select passport from tbl_passport_bind_status WHERE app_id =1 ORDER BY id DESC LIMIT 3000"
        cur.execute(sql)
        uids = cur.fetchall()
    finally:
        cur.close()
finally:
    conn.close()

url = "http://internal.passport.sohu.com/openlogin/api/timeline/home"
appkey = 'f@JclHjuQ`DORG<f0,OqS/Q(Lpp4&G'
appid = '1106'

for row in uids:
    # Each row is a 1-tuple holding the passport value selected above.
    userid = str(row[0])
    # Whole-second Unix timestamp, sent as "ct" and mixed into the signature.
    ct = str(int(time.time()))
    # Request signature: md5 over userid + appid + appkey + ct.
    code = hashlib.md5(userid+appid+appkey+ct).hexdigest()

    dicts = {}
    dicts['userid'] = userid
    dicts['openid'] = userid
    dicts['ct'] = ct
    dicts['code'] = code
    dicts['appid'] = appid

    sjson = json.dumps(dicts)
    print(sjson)

    # NOTE(review): the original also built url + '?json=' + sjson but never
    # used it; the bare url is what gets passed here. Confirm whether this
    # warm-up request is needed at all or was meant to carry the JSON.
    visit_oapi(url)

    # Passing a data argument makes urllib2 send the JSON as the request body.
    req = urllib2.Request(url, sjson)
    response = urllib2.urlopen(req)
    try:
        the_page = response.read()
    finally:
        response.close()

    # Extract and record the links found in this user's response.
    zim.getHttp(the_page,userid)

--------------------------------------------------

#!/usr/bin/python
#coding=utf-8


import string

# Characters accepted as part of a link by getHttp: every ASCII letter (each
# lowercase letter immediately followed by its uppercase form), the digits
# 0-9, and the punctuation marks '/', '.' and ':'.
listtags = []
for letter in string.ascii_lowercase:
    listtags.append(letter)
    listtags.append(letter.upper())

listtags.extend(string.digits)
listtags.extend(['/', '.', ':'])
def writeFile(userid, listlink):
    """Append one ``userid<TAB>link`` line to output.txt per entry in *listlink*.

    userid   -- string written as the first column of every line.
    listlink -- iterable of link strings, one output line each.

    The file is opened in append mode. The previous 'w' mode truncated the
    file on every call, so when links were written for several users in a
    row only the last user's lines survived. The file is created if it does
    not exist, even when *listlink* is empty.
    """
    with open("output.txt", 'a') as out:
        out.writelines(userid + '\t' + link + '\n' for link in listlink)

def getHttp(content,userid):
    """Find link candidates in *content*, keep the ones isHttpLink accepts,
    and hand them to writeFile together with *userid*.

    content -- text to scan; it is lower-cased before scanning.
    userid  -- passed through unchanged to writeFile.

    A candidate is the text 'http' followed by the longest run of characters
    that are all in ``listtags``. Candidates containing 'www' are skipped.
    Returns None; the result is delivered through writeFile, which is called
    even when no link was kept.

    Fixes over the original loop:
      * a link at offset 0 is no longer ignored (the loop tested ``> 0``);
      * a link running to the very end of the content is processed instead
        of being dropped, and no longer causes an endless loop.
    """
    marker = 'http'
    allowed = set(listtags)  # built once; membership test per character
    listlinks = []
    content = content.lower()

    start = content.find(marker)
    while start >= 0:
        body_start = start + len(marker)

        # Extend the candidate over every consecutive allowed character.
        end = body_start
        while end < len(content) and content[end] in allowed:
            end += 1
        tail = content[body_start:end]
        print(tail)

        # NOTE(review): candidates containing 'www' are deliberately
        # excluded, as in the original; confirm this filter is intended.
        if 'www' not in tail:
            candidate = marker + tail
            if isHttpLink(candidate):
                listlinks.append(candidate)

        # As in the original, the next search resumes right after the
        # current 'http' marker (not after the whole candidate).
        start = content.find(marker, body_start)

    writeFile(userid,listlinks)
import urllib2
import httplib,urllib
def isHttpLink(url):
    """Fetch *url* and report whether the page contains a known marker string.

    Returns True when the downloaded body contains either of the two marker
    strings checked below, and False otherwise. Any exception while opening
    or reading the URL is reported on stdout and also yields False.

    NOTE(review): the indentation of this function was lost in the source.
    ``return True`` is reconstructed as belonging to the marker check, so
    only pages containing a marker return True; confirm that this is the
    intended meaning, since the function name suggests otherwise.
    """
    try:
        response = urllib2.urlopen(url)
        try:
            html = response.read()
        finally:
            # The original never closed the response.
            response.close()
        # Substring tests also catch a marker at offset 0, which the
        # original ``find(...) > 0`` comparison missed.
        if ('<title>搜狐新闻客户端-首页</title>' in html
                or '你访问的页面地址有误' in html):
            print('%s %s' % (' you are is bad ...........', url))
            return True
    except Exception:
        # Best-effort probe: an unreachable or malformed URL is not a match.
        print('%s %s' % (' you are is error! ...........', url))
    return False

if __name__ == '__main__':
    # Manual smoke test: a sample string with two embedded 'http' candidates,
    # run through getHttp with a dummy user id.
    stra = 'sssgsdgdfhttp://w353.5345中哦噶诺..http://sfjsdlfkjs77888)i'
    getHttp(stra,'werw')
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值