使用 HTTP 访问谷歌的电子数据
通过 HTTP 和 XML 公开数据既简单又与平台无关,因此它已成为互联网上公开数据最流行的方法之一。只需要使用 HTTP 和少量 XML 处理,就可以访问数量大得惊人的数据。访问谷歌公布的电子数据(即 Google 电子表格,Spreadsheets)就是一个很好的应用,清单 11-4 演示了如何访问谷歌的电子数据。
注意
我们将要访问的电子表格来自卫报数据商店(Guardian Data Store),它通过谷歌电子表格发布了许多英国以及世界范围的统计数据,在这里可以发现许多有用的资源:
http://www.guardian.co.uk/data-store。
清单 11-4 使用 HTTP 访问谷歌的电子数据
open System
open System.IO
open System.Net
open System.Xml
open System.Xml.XPath
/// Prefix / namespace-URI pairs registered with the XmlNamespaceManager so
/// that the prefixes "at", "openSearch" and "gsx" can be used in XPath queries
/// against the feed XML.
let namespaces =
    [ "at", "http://www.w3.org/2005/Atom";
      "openSearch", "http://a9.com/-/spec/opensearchrss/1.0/";
      "gsx", "http://schemas.google.com/spreadsheets/2006/extended" ]
/// Reads the XML document and processes it into a matrix of strings.
///
/// xdoc        - the already-loaded XML document to query.
/// xpath       - XPath expression selecting the "row" nodes; it may use the
///               prefixes declared in `namespaces`.
/// columnNames - node names (relative XPath) to read from each row.
///
/// Returns a lazy sequence with one inner sequence of strings per selected
/// row, holding the value of each requested column in order.
let queryGoogleSpreadSheet (xdoc: XmlDocument) xpath columnNames =
    let nav = xdoc.CreateNavigator()
    // Register the prefixes so the XPath expressions can be resolved.
    let mngr = new XmlNamespaceManager(new NameTable())
    namespaces |> List.iter (fun (prefix, url) -> mngr.AddNamespace(prefix, url))
    let compiled = nav.Compile(xpath)
    compiled.SetContext(mngr)
    let rows = nav.Select(compiled)
    seq {
        // XPathNodeIterator is a non-generic enumerable, hence the downcast.
        for row in rows do
            let row = row :?> XPathNavigator
            let getValue nodename =
                // SelectSingleNode returns null when the row has no such
                // node; yield an empty string instead of raising a
                // NullReferenceException when the sequence is enumerated.
                match row.SelectSingleNode(nodename, mngr) with
                | null -> ""
                | node -> node.Value
            yield Seq.map getValue columnNames
    }
/// Reads the spreadsheet from its web address.
///
/// Issues an HTTP request for `url`, loads the response body into an
/// XmlDocument and returns, for every "/at:feed/at:entry" node, the values of
/// the requested `columnNames` (see queryGoogleSpreadSheet).
let getGoogleSpreadSheet (url: string) columnNames =
    let req = WebRequest.Create(url)
    // `use` disposes the response and its stream when this function returns;
    // the document is fully loaded into memory before that happens.
    use resp = req.GetResponse()
    use stream = resp.GetResponseStream()
    let xdoc = new XmlDocument()
    xdoc.Load(stream)
    queryGoogleSpreadSheet xdoc "/at:feed/at:entry" columnNames
/// A location to hold the information we're interested in: the country name
/// plus a sequence of (readable column name, optional numeric value) pairs.
type Location =
    { Country: string
      NameValuesList: seq<string * option<float>> }
/// Creates a Location from the readable column names and one row of values.
///
/// names - readable names for the value columns (everything after the first).
/// row   - the raw strings of one row; the first item is the country name and
///         the remaining items are parsed as floats.
///
/// Values that cannot be parsed as a float become None. Seq.head raises if
/// `row` is empty.
let createLocation names row =
    let country = Seq.head row
    let rest = Seq.skip 1 row
    // The explicit string annotation keeps the Double.TryParse overload
    // unambiguous.
    let tryParse (s: string) =
        match Double.TryParse s with
        | true, res -> Some res
        | false, _ -> None
    let values = Seq.map tryParse rest
    { Country = country
      NameValuesList = Seq.zip names values }
/// Gets the data and processes it into records.
///
/// url      - address of the spreadsheet feed.
/// colNames - (feed column name, readable name) pairs; the first pair is the
///            country column, so its readable name is skipped.
///
/// Returns a sequence of Location records, one per spreadsheet row.
let getDataAndProcess url colNames =
    // the names of the columns we want to read from the feed
    let cols = Seq.map fst colNames
    // the raw rows
    let data = getGoogleSpreadSheet url cols
    // the readable names of the value columns (first column is the country)
    let names = colNames |> Seq.map snd |> Seq.skip 1
    // create strongly typed records from the data
    data |> Seq.map (createLocation names)
/// Builds the public list-feed URL of a spreadsheet from its key.
let makeUrl key =
    sprintf "http://spreadsheets.google.com/feeds/list/%s/od6/public/values" key
/// Downloads the spreadsheet, converts every row to a Location record and
/// prints each record with the "%A" formatter.
let main() =
    // the key of the spreadsheet we're interested in
    let sheetKey = "phNtm3LmDZEP61UU2eSN1YA"
    // (feed column name, readable label) pairs for the columns we're
    // interested in; the first pair is the country column, whose label is
    // never used (getDataAndProcess skips it).
    // NOTE(review): the feed columns are named "...per10000population" while
    // the labels say "per 1000" - confirm which unit is intended.
    let cols =
        [ "gsx:location", "";
          "gsx:hospitalbedsper10000population",
          "Hospital beds per 1000";
          "gsx:nursingandmidwiferypersonneldensityper10000population",
          "Nursing and Midwifery Personnel per 1000" ]
    // get the data
    let data = getDataAndProcess (makeUrl sheetKey) cols
    // print the data
    Seq.iter (printfn "%A") data

do main()
[编译此程序需要引用 System.Xml.dll。]
运行前面的程序,得到如下的结果:
...
{Country = "Sweden";
 NameValuesList =
  seq
    [("Hospital beds per 1000", null);
     ("Nursing and Midwifery Personnel per 1000", Some 109.0)];}
{Country = "Switzerland";
 NameValuesList =
  seq
    [("Hospital beds per 1000", Some 57.0);
     ("Nursing and Midwifery Personnel per 1000", Some 110.0)];}
...
这个示例值得注意的重要一点是,我们用来检索数据的方法几乎没有改变。在代码的核心部分,我们发现有几行代码与前面生成 HTTP 请求、检索 XML 文档的代码相同:
// Excerpt repeated from getGoogleSpreadSheet in listing 11-4; `url` is that
// function's parameter.
// Create the request for the address.
let req = WebRequest.Create(url)
// `use` disposes the response and the stream when the enclosing scope ends.
use resp = req.GetResponse()
use stream = resp.GetResponseStream()
// Load the whole response body into an in-memory XML document.
let xdoc = new XmlDocument()
xdoc.Load(stream)
其余的大部分代码都是处理返回的 XML 数据。