# Read URLs from 1.txt, extract the URLs found in each fetched page, and save them to 2.txt.
# -*- coding: UTF-8 -*-
import urllib
import re
# Fetch the contents of the target web page
def getHtml(url):
    """Fetch ``url`` and return the body of the response.

    Args:
        url: Address of the page to download.

    Returns:
        The page content exactly as ``read()`` returns it; no decoding
        is performed here.
    """
    # ``urlopen`` lives in ``urllib`` on Python 2 and in ``urllib.request``
    # on Python 3; resolve it locally so the function works on both.
    try:
        from urllib.request import urlopen
    except ImportError:
        from urllib import urlopen

    page = urlopen(url)
    try:
        return page.read()
    finally:
        # Always release the underlying connection/file handle, even when
        # read() raises.
        page.close()
# Extract the URLs found in a page's HTML
def getUrl(html):
    """Return every double-quoted ``href`` value found in ``html``.

    Args:
        html: Page source to scan.

    Returns:
        A list of the captured ``href`` values in order of appearance
        (duplicates included); an empty list when nothing matches.
    """
    # Non-greedy capture: stop at the first closing double quote.
    urlre = re.compile(r'href="(.+?)"')
    return urlre.findall(html)
# Filter the extracted URLs
def selUrl(gurl):
s = list(set(gurl))
#set去重复url
fw = ''
for i in range(len(s)) :
fp = open('2.txt','w')
#判断url是否含有http....头部,没有的加上
if "/"in s[i]:
if "http" in s[i]:
fw += (s[i]+'\n')
fp.write(fw)
else: