《Python网络数据采集》第四章(阅读代码笔记)

本文链接：https://blog.csdn.net/qq_34908167/article/details/78849964

分享学习Python时，跟着书籍敲的代码。

第一本书：《Byte Of Python》，给出代码笔记链接：ByteOfPython笔记代码，链接中有此书的PDF格式资源。

第二本书：《Python网络数据采集》，给出此书PDF格式的资源链接：https://pan.baidu.com/s/1eSq6x5g 密码：a46q

此篇给出《Python网络数据采集》第四章：使用API 的代码笔记，供大家参考。

第四章：使用API

#-*-coding:utf-8-*-

######使用API
from urllib.error import HTTPError
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib
import json
import re
import datetime
import random

####简单的请求头，信息请求
# token="<your api key>"
# webRequest=urllib.request.Request("http://myapi.com",headers={"token":token})
# html=urlopen(webRequest)
# print(html)

####获取反馈数据，之后使用python的json解析函数来解码
##打印出对应ip地址的国家代码
# def getCountry(ipAddress):
#     response=urlopen("http://freegeoip.net/json/"+ipAddress).read().decode("utf-8")
#     responseJson=json.loads(response)
#     # print(responseJson)
#     return responseJson.get("country_name")+"----->"+responseJson.get("region_name")

# print(getCountry('50.78.253.58'))#us
# print(getCountry('123.125.71.38'))#cn
# print(getCountry('219.42.0.0'))#jp

####解析JSON字符串
# jsonString='{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],' \
#            '"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
#
# jsonObj=json.loads(jsonString)
# print(jsonObj.get('arrayOfNums'))
# print(jsonObj.get('arrayOfNums')[0])
# print(jsonObj.get("arrayOfNums")[1].get("number")+
#       jsonObj.get("arrayOfNums")[2].get("number"))
# print(jsonObj.get('arrayOfFruits')[2].get("fruit"))

####根据维基百科编辑的历史的ip，获取维基百科编辑者的所在城市

# Seed the generator from the current time. A float timestamp is used because
# random.seed() only accepts None, int, float, str, bytes or bytearray on
# Python 3.11+; passing a datetime object raises TypeError there.
random.seed(datetime.datetime.now().timestamp())

def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    articleUrl is a site-relative path (for example "/wiki/Some_Title") that
    is appended to "http://en.wikipedia.org". The result contains the <a>
    tags inside the div with id "bodyContent" whose href starts with "/wiki/"
    and contains no colon (presumably to skip namespaced pages such as
    "File:..." -- confirm against the site's URL scheme).

    An empty list is returned when the page has no "bodyContent" div.
    Errors raised by urlopen propagate to the caller.
    """
    # Parse inside the with-block so the HTTP response is closed afterwards.
    with urlopen("http://en.wikipedia.org"+articleUrl) as html:
        bshtml=BeautifulSoup(html,"html.parser")

    body=bshtml.find("div",{"id":"bodyContent"})
    if body is None:
        # find() returns None when the container is missing; return an empty
        # result instead of failing with AttributeError.
        return []
    return body.findAll("a",href=re.compile(r"^(/wiki/)((?!:).)*$"))

def getHistoryIPs(pageUrl):
    """Return the set of anonymous-editor link texts from a page's history.

    pageUrl is a site-relative article path such as "/wiki/Title_in_URL".
    The revision history page is fetched from
    http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    and the text of every <a class="mw-anonuserlink"> is collected.

    Returns a set of strings (duplicates removed). Errors raised by urlopen
    propagate to the caller.
    """
    # Strip only the leading "/wiki/" prefix; a blanket replace() would also
    # remove "/wiki/" from inside a title and request the wrong page.
    prefix="/wiki/"
    if pageUrl.startswith(prefix):
        pageUrl=pageUrl[len(prefix):]
    historyurl="http://en.wikipedia.org/w/index.php?title={0}&action=history".format(pageUrl)
    print("历史地址：{0}".format(historyurl))

    # Parse inside the with-block so the HTTP response is closed afterwards.
    with urlopen(historyurl) as html:
        bshtml=BeautifulSoup(html,"html.parser")

    # Links with class "mw-anonuserlink" show an IP address in place of a
    # user name.
    ipAddresses=bshtml.findAll("a",{"class":"mw-anonuserlink"})
    return {ipaddress.get_text() for ipaddress in ipAddresses}

####调用 1:
# links = getLinks("/wiki/Python_(programming_language)")
#
# while(len(links)>0):
#     for link in links:
#         print("--------------------")
#         historyIPs=getHistoryIPs(link.attrs["href"])
#         for ip in historyIPs:
#             print(getCountry(ip))
#     newLink=links[random.randint(0,len(links)-1)].attrs["href"]
#     links=getLinks(newLink)


####调用2：
def getCountry(ipAddress):
    """Look up the country name for an IP address via freegeoip.net.

    Requests "http://freegeoip.net/json/<ipAddress>", decodes the body as
    UTF-8 JSON and returns its "country_name" field.

    Returns None when the request fails, when the body is not valid UTF-8
    JSON, when the decoded JSON is not an object, or when the field is absent.
    """
    try:
        # The with-block closes the HTTP response once the body is read.
        with urlopen("http://freegeoip.net/json/"
                     +ipAddress) as response:
            raw = response.read()
    except OSError:
        # HTTPError and URLError both derive from OSError, so this also covers
        # DNS and connection failures instead of aborting the whole crawl.
        return None

    try:
        responseJson = json.loads(raw.decode('utf-8'))
    except ValueError:
        # Covers both UnicodeDecodeError and json.JSONDecodeError.
        return None

    if not isinstance(responseJson, dict):
        return None
    return responseJson.get("country_name")
# Crawl driver: start at the Python article, report where the anonymous
# editors of every linked article come from, then hop to a randomly chosen
# linked article and repeat until a page yields no links.
links = getLinks("/wiki/Python_(programming_language)")
while len(links) > 0:
    for link in links:
        print("-------------------")
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is None:
                # Lookup failed; nothing to report for this address.
                continue
            print(historyIP+"\t is from \t"+country)
    # Pick the next page uniformly at random from the current page's links.
    newLink = links[random.randint(0, len(links)-1)].attrs["href"]
    links = getLinks(newLink)

第五章：存储数据代码笔记