【Python编程】网页URL提取实例

import urllib.request
import re

def ssubcatagory(urllink,j):
    """Fetch a second-level category page and print its third-level entries.

    For every ``<div class="kag sclearfix">`` block found on the page, the
    block title (the first anchor text following ``blank">``) is printed,
    followed by one line per ``class="kaj"`` link in the form
    ``<url without the http:// prefix> <link text>``.

    Args:
        urllink: URL of the category page to download.
        j: 1-based position of this category in the caller's listing. Only
            the first ten categories are processed; for larger values the
            function returns without doing anything.

    Raises:
        urllib.error.URLError: if the page cannot be retrieved.
    """
    # Pages past the tenth are never parsed, so do not download them at all.
    if j > 10:
        return

    # The context manager guarantees the response is closed, even on error.
    with urllib.request.urlopen(urllink) as fp:
        raw = fp.read()

    # Sniff the charset declaration in the first 1000 bytes of the page;
    # GB2312 is the fallback when neither UTF-8 nor GBK is announced.
    head = raw[:1000].lower()
    if b"utf-8" in head:
        encoding = 'UTF-8'
    elif b"gbk" in head:
        encoding = 'GBK'
    else:
        encoding = 'GB2312'
    # A single mis-encoded byte should not abort the whole crawl.
    page = raw.decode(encoding, errors='replace')

    # NOTE: the original pattern contained the escape "\k", which the re
    # module rejects ("bad escape \k"); the class name is matched literally.
    blocks = re.findall(r'<div class="kag sclearfix">(.*?)</div></div></div>', page)
    for block in blocks:
        titles = re.findall(r'blank\">(.*?)</a>', block)
        # A block without a title anchor is still scanned for links.
        if titles:
            print("三级:", titles[0])
        links = re.findall(r'href=\"http\://(.*?)\" class=\"kaj\">(.*?)</a>', block)
        for address, label in links:
            print(address, label)
 
def subcatagory(mystr,j):
    """Print the second-level links of the "生活服务" section and crawl each one.

    Args:
        mystr: Decoded HTML text of the sitemap page.
        j: 1-based index of the sitemap section being visited. Only
            ``j == 1`` is handled; any other value is a no-op.
    """
    if j != 1:
        return

    sections = re.findall(r'生活服务</b></h3><ul class=\"list clearfix\">(.*?)</ul></div>', mystr)
    # Example of a matched link: href="http://gouwu.hao123.com/" class="link"><b>购物</b>
    print("一级:生活服务--")

    # Match against each captured section directly. Matching against
    # str(list) would scan the list's repr, in which quotes and backslashes
    # inside the URLs are escaped and therefore extracted incorrectly.
    k = 0
    for section in sections:
        for url, title in re.findall(r'href=\"(.*?)\" class=\"link\"><b>(.*?)</b>', section):
            k += 1
            print("二级:", url, title)
            # Descend into the second-level page (shopping, lottery, ...).
            ssubcatagory(url, k)

def mainfun():
    """Download the hao123 sitemap, dump it to results.txt and walk its sections.

    The page is decoded as GBK, written to ``results.txt`` in the current
    working directory, and ``subcatagory`` is then called once per
    ``<div class="section" id="...">`` element with a 1-based index.

    Raises:
        urllib.error.URLError: if the sitemap cannot be retrieved.
        UnicodeDecodeError: if the response is not valid GBK.
    """
    url = 'http://www.hao123.com/sitemap'
    # Read the whole response inside the context manager so the connection
    # is released even if reading fails.
    with urllib.request.urlopen(url) as fp:
        content = fp.read()
    mystr = content.decode('GBK')

    # The original wrote "file.close" without parentheses, so the file was
    # never explicitly closed. An explicit encoding keeps the dump writable
    # on platforms whose default encoding cannot represent Chinese text.
    with open('results.txt', 'w', encoding='utf-8') as dump:
        dump.write(mystr)

    # Example of a section header: <div class="section" id="生活服务">
    catapattern = re.findall(r'<div class=\"section\" id=\"(.*?)\">',mystr)
    for j, _section_id in enumerate(catapattern, start=1):
        subcatagory(mystr, j)

# Run the crawl only when executed as a script, so that importing this
# module does not trigger network requests and file writes.
if __name__ == "__main__":
    mainfun()

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值