import urllib.request
import re
def ssubcatagory(urllink, j):
    """Fetch a second-level category page and print its third-level listings.

    The page at ``urllink`` is downloaded, decoded and scanned for
    ``<div class="kag sclearfix">`` sections. For every section the text of
    the first ``blank">...</a>`` anchor is printed as the third-level heading
    (prefixed with "三级:"), followed by one line per ``class="kaj"`` link
    holding the address (without its ``http://`` prefix) and the link text.

    Args:
        urllink: URL of the page to fetch; passed to
            ``urllib.request.urlopen``.
        j: 1-based position of this page among its siblings. Only the first
            ten pages are processed; for ``j > 10`` nothing is fetched or
            printed.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
        UnicodeDecodeError: If the payload is not valid in the sniffed charset.
    """
    if j > 10:
        # Pages past the tenth are never printed, so skip the download too.
        return

    # The context manager closes the response even if read() fails.
    with urllib.request.urlopen(urllink) as response:
        filecontent = response.read()

    # Sniff the charset name from the first 1000 bytes of the raw payload;
    # GB2312 is the fallback when neither UTF-8 nor GBK is mentioned.
    head = filecontent[:1000].lower()
    if b"utf-8" in head:
        encoding = "UTF-8"
    elif b"gbk" in head:
        encoding = "GBK"
    else:
        encoding = "GB2312"
    mystr = filecontent.decode(encoding)

    # NOTE: the previous pattern contained '\k', which `re` rejects as a bad
    # escape on Python 3.7+; a plain double quote is what has to be matched.
    sections = re.findall(
        r'<div class="kag sclearfix">(.*?)</div></div></div>', mystr
    )
    for section in sections:
        titles = re.findall(r'blank">(.*?)</a>', section)
        if titles:
            # A section without a heading anchor still gets its links listed.
            print("三级:", titles[0])
        links = re.findall(
            r'href="http://(.*?)" class="kaj">(.*?)</a>', section
        )
        for address, label in links:
            print(address, label)
def subcatagory(mystr, j):
    """Print the second-level entries of the "生活服务" section and crawl each.

    Nothing happens unless ``j == 1``. In that case the ``<ul class="list
    clearfix">`` list following the "生活服务" heading is extracted from
    ``mystr``, the first-level heading line is printed, and for every
    ``class="link"`` anchor found in it the URL and bold label are printed
    (prefixed with "二级:") before ``ssubcatagory`` is called on that URL.

    Args:
        mystr: Decoded HTML of the sitemap page.
        j: 1-based index of the section being visited; only ``1`` is handled.
    """
    if j != 1:
        return

    blocks = re.findall(
        r'生活服务</b></h3><ul class="list clearfix">(.*?)</ul></div>', mystr
    )
    print("一级:生活服务--")

    # Search each matched HTML fragment directly. Searching the repr of the
    # whole list (str(list)) would escape apostrophes/backslashes in URLs and
    # could let a match run across the boundary between two fragments.
    link_pattern = re.compile(r'href="(.*?)" class="link"><b>(.*?)</b>')
    links = [found for block in blocks for found in link_pattern.findall(block)]

    # The 1-based position is handed on so the callee can cap how many
    # second-level pages it actually processes.
    for position, (href, label) in enumerate(links, start=1):
        print("二级:", href, label)
        ssubcatagory(href, position)
def mainfun():
    """Download the hao123 sitemap, dump it to disk and crawl its sections.

    The sitemap is fetched and decoded as GBK, written to ``results.txt`` in
    the current working directory, and then ``subcatagory`` is called once
    per ``<div class="section" id="...">`` element with that element's
    1-based position.

    Raises:
        urllib.error.URLError: If the sitemap cannot be fetched.
        UnicodeDecodeError: If the payload is not valid GBK.
        OSError: If ``results.txt`` cannot be written.
    """
    url = 'http://www.hao123.com/sitemap'

    # Close the response as soon as the body is read instead of holding the
    # connection open for the whole crawl.
    with urllib.request.urlopen(url) as response:
        content = response.read()
    mystr = content.decode('GBK')

    # An explicit encoding keeps the dump writable on platforms whose default
    # codec cannot represent Chinese text; `with` guarantees the file is
    # flushed and closed (the old bare `file.close` never called the method).
    with open('results.txt', 'w', encoding='utf-8') as dump:
        dump.write(mystr)

    # Example of a matched tag: <div class="section" id="生活服务">
    catapattern = re.findall(r'<div class="section" id="(.*?)">', mystr)
    for j, _section_id in enumerate(catapattern, start=1):
        subcatagory(mystr, j)
# Start the crawl. The call is unconditional (there is no
# `if __name__ == "__main__"` guard), so it also runs on import.
mainfun()