0.python-简单的爬虫示例(ip138)

import requests
import xml.etree.ElementTree as ET

from xml.parsers.expat import ParserCreate

class DefaultSaxHandler(object):
    """Expat callback holder that collects ``(title, href)`` pairs.

    Every element other than ``map`` that carries both a ``title`` and an
    ``href`` attribute is appended, in document order, to the list passed
    to the constructor.
    """

    def __init__(self, provinces):
        # Caller-owned list; results are appended to it in place.
        self.provinces = provinces

    def start_element(self, name, attrs):
        """Record the element's ``title``/``href`` unless it is the ``map`` wrapper.

        Args:
            name: Tag name reported by the parser.
            attrs: Mapping of attribute names to values for this element.
        """
        if name == 'map':
            return
        title = attrs.get('title')
        href = attrs.get('href')
        # An element missing either attribute is skipped rather than
        # aborting the whole parse with a KeyError.
        if title is None or href is None:
            return
        self.provinces.append((title, href))

    def end_element(self, name):
        """Closing tags carry no information needed here."""
        pass

    def char_data(self, text):
        """Text content is ignored; only attributes are collected."""
        pass

def get_provinces(url, timeout=10):  # e.g. http://www.ip138.com/post/
    """Download *url* and return the entries of its ``map_86`` image map.

    Args:
        url: Address of the page that contains the ``<map>`` element.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the caller forever.

    Returns:
        A list of ``(title, href)`` tuples in document order, or an empty
        list when the page does not contain the expected map.

    Raises:
        xml.parsers.expat.ExpatError: If the extracted markup is not
            well-formed XML.
    """
    map_open = '<map name="map_86" id="map_86">'  # found by inspecting the page (F12)
    map_close = '</map>'

    response = requests.get(url, timeout=timeout)
    # GBK is a superset of GB2312, so pages labelled gb2312 that contain
    # GBK-only characters still decode; undecodable bytes are replaced
    # instead of raising UnicodeDecodeError.
    content = response.content.decode('gbk', errors='replace')

    start = content.find(map_open)
    # Search for the closing tag only after the opening tag, so an earlier
    # </map> elsewhere on the page cannot be picked up by mistake.
    end = content.find(map_close, start) if start != -1 else -1
    if start == -1 or end == -1:
        return []

    # strip() drops leading/trailing whitespace around the fragment.
    content = content[start:end + len(map_close)].strip()
    print(content)

    provinces = []
    handler = DefaultSaxHandler(provinces)
    parser = ParserCreate()
    parser.StartElementHandler = handler.start_element
    parser.EndElementHandler = handler.end_element
    parser.CharacterDataHandler = handler.char_data
    # The second argument marks this as the final chunk so the parser
    # reports truncated or unclosed markup.
    parser.Parse(content, True)
    return provinces

def find_provinces(provinces, name):
    """Look up *name* among ``(title, href)`` pairs and return its href.

    The first pair whose title equals *name* is printed and its second
    item returned. When no pair matches, nothing is printed and ``None``
    is returned.
    """
    candidates = (entry for entry in provinces if entry[0] == name)
    hit = next(candidates, None)
    if hit is None:
        return None
    print(hit)
    return hit[1]


if __name__ == "__main__":
    # Demo run: fetch the province list, show it, then look up one entry.
    # Guarded so that importing this module does not trigger a network
    # request or print anything.
    provinces = get_provinces("http://www.ip138.com/post/")
    print(provinces)
    print(find_provinces(provinces, "宁夏"))
<map name="map_86" id="map_86">
  <area shape="polygon" target="_blank" href="/83/" coords="149,70,146,76,139,89,124,80,119,80,119,86,109,93,109,103,95,102,84,99,85,114,89,125,79,132,79,140,67,151,45,151,32,160,25,147,8,151,1,159,-1,176,5,178,6,189,1,197,10,204,13,220,28,235,33,247,32,254,44,256,56,245,67,251,78,247,85,253,102,252,112,256,117,250,145,248,153,258,167,263,174,262,166,251,173,246,170,234,166,230,167,224,173,221,187,221,194,217,195,212,197,201,204,193,230,181,232,168,240,168,233,148,213,131,198,124,188,123,187,117,190,108,188,88,171,73,165,60,160,62,156,67,150,65" title="新疆" style="background-color: #fff;" />
  <area shape="polygon" target="_blank" href="/85/" coords="34,257,34,280,21,281,21,296,25,304,35,312,41,326,64,334,74,351,82,356,84,362,94,367,111,375,120,375,122,384,131,384,132,377,147,382,156,381,159,384,161,392,166,397,171,402,179,403,183,398,196,396,202,390,216,389,223,396,229,390,234,389,238,385,243,388,248,384,246,381,252,373,247,371,249,346,245,335,237,328,...

[('新疆', '/83/'), ('西藏', '/85/'), ('青海', '/81/'), ('甘肃', '/73/'), ('四川', '/61/'), ('云南', '/65/'), ('宁夏', '/75/'), ('内蒙古', '/01/'), ('黑龙江', '/15/'), ('吉林', '/13/'), ('辽宁', '/11/'), ('河北', '/50/'), ('北京', '/10/'), ('天津', '/30/'), ('陕西', '/71/'), ('山西', '/03/'), ('山东', '/25/'), ('河南', '/45/'), ('重庆', '/40/'), ('湖北', '/43/'), ('安徽', '/23/'), ('江苏', '/21/'), ('上海', '/20/'), ('贵州', '/55/'), ('广西', '/53/'), ('湖南', '/41/'), ('江西', '/33/'), ('浙江', '/31/'), ('福建', '/35/'), ('广东', '/51/'), ('海南', '/57/'), ('台湾', '/taiwang/'), ('澳门', '/aomen/'), ('香港', '/xianggang/')]

('宁夏', '/75/')

/75/

 

 

 

 

 

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值