文件分类:
handle.py
get_url.py
urlsh.sh
seed_url.good
amazon.good
handle.py 的功能:解析网页源代码,提取其中的分类节点编号,并将生成的 URL 追加写入 amazon.good。
#!/usr/bin/python
import sys,re
#handle.py
# Matches category links of the form
#   href="/s?ie=UTF8&page=1&rh=n%3A<digits>">
# and captures the digits that follow the URL-encoded "n:" prefix.
# Written as a raw string so the regex escapes are not parsed as
# (invalid) string escapes; the pattern text itself is unchanged.
NODE_LINK_PATTERN = re.compile(r'href="\/s\?ie=UTF8&page=1&rh=n%3A([0-9]*?)">')

# File that receives the generated URLs; it is opened in append mode.
OUTPUT_PATH = "amazon.good"


def extract_node_urls(lines):
    """Return one quoted node-URL line per category link found in *lines*.

    lines: any iterable of strings, e.g. an open text file.

    Returns a list of strings in input order, each of the form
    '"http://www.amazon.com/b?ie=UTF8&node=<id>"' followed by a newline.
    NOTE: the pattern accepts zero digits, so <id> can be empty.
    """
    return [
        '"http://www.amazon.com/b?ie=UTF8&node=' + node_id + '"\n'
        for line in lines
        for node_id in NODE_LINK_PATTERN.findall(line)
    ]


def main(argv):
    """Append the node URLs found in the file named by argv[1] to OUTPUT_PATH.

    Returns a process exit status: 0 on success, 2 when no input path
    was supplied on the command line.
    """
    if len(argv) < 2:
        sys.stderr.write("usage: handle.py <page-source-file>\n")
        return 2
    # Context managers guarantee both files are closed, even on error.
    with open(argv[1]) as source:
        urls = extract_node_urls(source)
    with open(OUTPUT_PATH, 'a') as sink:
        sink.writelines(urls)
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))
#f_w.write(u+'\