Web scraper: crawl the capital and area of every country and save them to a txt file

# -*- coding:utf-8 -*-
# Python 2 script: urllib2 and subscriptable dict.keys()/values() are Python 2 only.

import urllib2
from lxml import etree

def main():
    # Truncate the output file first; get_capital() appends to it below.
    f = open('./countrys.txt', 'w+')
    f.close()
    countrys = []
    url = 'https://guojiadiqu.51240.com/'
    html = urllib2.urlopen(url).read()
    selector = etree.HTML(html)
    # Every <ul> under div#main_content holds a block of country links.
    uls = selector.xpath('//div[@id="main_content"]/ul')
    for ul in uls:
        lis = ul.xpath('./li')
        for li in lis:
            country_infos = {}
            key = li.xpath('./a/text()')[0]  # country name
            value = 'https://guojiadiqu.51240.com' + li.xpath('./a/@href')[0].strip()  # detail-page URL
            country_infos[key] = value
            countrys.append(country_infos)
    return get_capital(countrys)

def get_capital(countries):
    i = 0
    for country in countries:
        i += 1
        # Each dict holds a single name -> URL pair (Python 2 keys()/values() return lists).
        name = country.keys()[0]
        url = country.values()[0]
        html = urllib2.urlopen(url).read()
        tree = etree.HTML(html)
        # The third row of the nested info table carries the capital/area text.
        table = tree.xpath('//div[@id="main_content"]/table')[0]
        rows = table.xpath('./tr/td/table/tr')
        cells = rows[2].xpath('./td/text()')
        f = open('./countrys.txt', 'a')
        if len(cells) > 0:
            content = str(i) + ' ' + name + '\n ' + cells[0] + '\n'
        else:
            content = str(i) + ' ' + name + '\n' + ' \n'
        f.write(content.encode('utf-8'))
        f.close()

if __name__ == "__main__":
    main()
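
The script above runs only under Python 2: urllib2 was removed in Python 3, and country.keys()[0] relies on keys() returning a list. For reference, here is a minimal Python 3 sketch of the same crawl, assuming the page structure at guojiadiqu.51240.com is unchanged; the fetch() helper and BASE_URL constant are illustrative names, not part of the original.

    # -*- coding:utf-8 -*-
    # Python 3 sketch of the same crawl; the site layout is assumed unchanged.
    from urllib.request import urlopen

    from lxml import etree

    BASE_URL = 'https://guojiadiqu.51240.com'

    def fetch(url):
        # etree.HTML() accepts raw bytes and honours the page's declared charset.
        return etree.HTML(urlopen(url).read())

    def main():
        index = fetch(BASE_URL + '/')
        links = index.xpath('//div[@id="main_content"]/ul/li/a')
        with open('./countrys.txt', 'w', encoding='utf-8') as out:
            for i, a in enumerate(links, 1):
                name = a.xpath('./text()')[0]
                detail_page = fetch(BASE_URL + a.xpath('./@href')[0].strip())
                rows = detail_page.xpath('//div[@id="main_content"]/table/tr/td/table/tr')
                cells = rows[2].xpath('./td/text()')
                detail = cells[0] if cells else ''
                out.write('%d %s\n %s\n' % (i, name, detail))

    if __name__ == '__main__':
        main()

Opening the file once in a with block replaces the per-country open/append/close cycle, and enumerate() replaces the manual counter; the output format matches the original.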

Reposted from: https://www.cnblogs.com/wozuilang-mdzz/p/9737265.html
