分享学习Python时跟着书籍敲的代码。
第一本书:《Byte Of Python》,给出代码笔记链接:ByteOfPython笔记代码,链接中有此书的PDF格式资源。
第二本书:《Python网络数据采集》,给出此书PDF格式的资源链接:https://pan.baidu.com/s/1eSq6x5g 密码:a46q
此篇给出《Python网络数据采集》第四章:使用API 的代码笔记,供大家参考。
第四章:使用API
#-*-coding:utf-8-*-
######使用API
from urllib.error import HTTPError
from urllib.request import urlopen
from bs4 import BeautifulSoup
import urllib
import json
import re
import datetime
import random
####简单的请求头,信息请求
# token="<your api key>"
# webRequest=urllib.request.Request("http://myapi.com",headers={"token":token})
# html=urlopen(webRequest)
# print(html)
####获取反馈数据,之后使用python的json解析函数来解码
##打印出对应ip地址的国家代码
# def getCountry(ipAddress):
# response=urlopen("http://freegeoip.net/json/"+ipAddress).read().decode("utf-8")
# responseJson=json.loads(response)
# # print(responseJson)
# return responseJson.get("country_name")+"----->"+responseJson.get("region_name")
# print(getCountry('50.78.253.58'))#us
# print(getCountry('123.125.71.38'))#cn
# print(getCountry('219.42.0.0'))#jp
####解析JSON字符串
# jsonString='{"arrayOfNums":[{"number":0},{"number":1},{"number":2}],' \
# '"arrayOfFruits":[{"fruit":"apple"},{"fruit":"banana"},{"fruit":"pear"}]}'
#
# jsonObj=json.loads(jsonString)
# print(jsonObj.get('arrayOfNums'))
# print(jsonObj.get('arrayOfNums')[0])
# print(jsonObj.get("arrayOfNums")[1].get("number")+
# jsonObj.get("arrayOfNums")[2].get("number"))
# print(jsonObj.get('arrayOfFruits')[2].get("fruit"))
####根据维基百科编辑的历史的ip,获取维基百科编辑者的所在城市
# Seed the generator from the current time. random.seed() accepts only None,
# int, float, str, bytes and bytearray; passing a datetime object directly
# raises TypeError on Python 3.11+, so seed with the float timestamp instead.
random.seed(datetime.datetime.now().timestamp())
def getLinks(articleUrl):
    """Return the article links found in the body of a Wikipedia page.

    Args:
        articleUrl: Site-relative path (e.g. "/wiki/Some_Article"); it is
            appended to "http://en.wikipedia.org" to build the request URL.

    Returns:
        A list of <a> tags located inside the <div id="bodyContent"> element
        whose href starts with "/wiki/" and contains no colon. An empty list
        is returned when the page has no such <div>.
    """
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen("http://en.wikipedia.org"+articleUrl) as html:
        bshtml = BeautifulSoup(html, "html.parser")
    body = bshtml.find("div", {"id": "bodyContent"})
    if body is None:
        # Without a body container there is nothing to follow; returning an
        # empty list avoids an AttributeError on None and lets callers that
        # loop on len(links) terminate normally.
        return []
    return body.find_all("a", href=re.compile("^(/wiki/)((?!:).)*$"))
def getHistoryIPs(pageUrl):
    """Collect the anonymous-editor link texts from an article's edit history.

    Args:
        pageUrl: Article path such as "/wiki/Title_in_URL". A leading
            "/wiki/" is stripped to obtain the title used in the query.

    Returns:
        A set with the text of every <a class="mw-anonuserlink"> element on
        the history page (those links show an IP address instead of a user
        name).
    """
    # The edit-history page URL has the form:
    # http://en.wikipedia.org/w/index.php?title=Title_in_URL&action=history
    # Strip only the leading prefix so that a title which itself contains
    # "/wiki/" further along is not mangled.
    if pageUrl.startswith("/wiki/"):
        pageUrl = pageUrl[len("/wiki/"):]
    historyurl = "http://en.wikipedia.org/w/index.php?title={0}&action=history".format(pageUrl)
    print("历史地址:{0}".format(historyurl))
    # Close the HTTP response as soon as the document has been parsed.
    with urlopen(historyurl) as html:
        bshtml = BeautifulSoup(html, "html.parser")
    # Find the links whose class attribute is "mw-anonuserlink";
    # they carry an IP address in place of a user name.
    ipAddresses = bshtml.find_all("a", {"class": "mw-anonuserlink"})
    # A set drops duplicates when the same address edited more than once.
    return {ipaddress.get_text() for ipaddress in ipAddresses}
####调用 1:
# links = getLinks("/wiki/Python_(programming_language)")
#
# while(len(links)>0):
# for link in links:
# print("--------------------")
# historyIPs=getHistoryIPs(link.attrs["href"])
# for ip in historyIPs:
# print(getCountry(ip))
# newLink=links[random.randint(0,len(links)-1)].attrs["href"]
# links=getLinks(newLink)
####调用2:
def getCountry(ipAddress):
    """Look up the country name for an IP address via the freegeoip.net API.

    Args:
        ipAddress: The address as a string; it is appended to
            "http://freegeoip.net/json/" to build the request URL.

    Returns:
        The value of the "country_name" field of the JSON reply, or None
        when the request fails or the reply cannot be decoded as JSON.
    """
    # NOTE(review): confirm that freegeoip.net is still reachable; if it is
    # not, every lookup takes the failure path below and returns None.
    try:
        with urlopen("http://freegeoip.net/json/"
                     + ipAddress) as reply:
            payload = reply.read()
    except OSError:
        # HTTPError and URLError both derive from OSError, as do socket
        # timeouts and connection resets, so one failed lookup is skipped
        # instead of aborting the whole crawl.
        return None
    try:
        # UnicodeDecodeError and json.JSONDecodeError are ValueError subclasses.
        responseJson = json.loads(payload.decode('utf-8'))
    except ValueError:
        return None
    return responseJson.get("country_name")
# Crawl driver: start from the Python article, report the country of every
# anonymous editor of each linked article, then hop to a randomly chosen
# link from the current page and repeat until a page yields no links.
links = getLinks("/wiki/Python_(programming_language)")
while links:
    for link in links:
        print("-------------------")
        historyIPs = getHistoryIPs(link.attrs["href"])
        for historyIP in historyIPs:
            country = getCountry(historyIP)
            if country is None:
                # Lookup failed for this address; nothing to report.
                continue
            print(historyIP+"\t is from \t"+country)
    # Pick the next page to crawl uniformly at random from the current links.
    pick = random.randint(0, len(links)-1)
    newLink = links[pick].attrs["href"]
    links = getLinks(newLink)
第五章: 存储数据代码笔记