公司不让看新闻,这几天做了个小工具用来看新闻(目前只能抓取标题,还没有实现抓取具体内容……)。代码如下:
服务器端由两个文件组成,第一个:
BaiduNews.fs:
View Code
// Downloads the Baidu News front page and exposes one function per news
// category, each returning that category's headlines as a single string.
module BaiduNews

#if INTERACTIVE
#r @"C:\Users\v-shuzhu\Desktop\HtmlAgilityPack.dll"
#endif

open System
open System.Diagnostics
open System.Net
open System.Xml
open System.IO
open HtmlAgilityPack

// Best-effort debug dump of the parsed page as XML. A missing drive or a
// locked file must not prevent the news from being served, so I/O failures
// are reported on stdout and otherwise ignored.
let private trySaveSnapshot (doc : HtmlDocument) (fileNameXml : string) =
    try
        if File.Exists(fileNameXml) then
            File.Delete(fileNameXml)
        doc.Save(fileNameXml)
    with
    | :? IOException as e ->
        printfn "Could not save %s: %s" fileNameXml e.Message
    | :? UnauthorizedAccessException as e ->
        printfn "Could not save %s: %s" fileNameXml e.Message

// Downloads `newUrl`, parses it as gb2312-encoded HTML and returns the parsed
// HtmlDocument. Despite the name, the call blocks until the download is done
// (the async workflow is run synchronously).
// Side effect: tries to write an XML dump of the page to D:\<mangled url>.xml.
let asyncGrapUrl (newUrl : string) =
    async {
        let fileNameXml =
            @"D:\" + newUrl.Replace('.', '0').Replace('/', '0').Replace(':', '0') + ".xml"
        let httpRequest = HttpWebRequest.Create(newUrl) :?> HttpWebRequest
        // `use` guarantees the response and its stream are released even when
        // loading/parsing throws.
        use! httpRespon = Async.AwaitTask(httpRequest.GetResponseAsync())
        use responStream = httpRespon.GetResponseStream()
        let xml = new HtmlDocument()
        xml.Load(responStream, Text.Encoding.GetEncoding("gb2312"), true)
        xml.OptionOutputAsXml <- true
        trySaveSnapshot xml fileNameXml
        return xml
    }
    |> Async.RunSynchronously

// Extracts the headlines from a sequence of <ul> nodes.
// Every child node of every <ul> contributes its trimmed inner text followed
// by a line break; the pieces are concatenated into one string.
// A null sequence yields the empty string.
let extractTextFromUl (ulSeq : seq<HtmlNode>) =
    if obj.ReferenceEquals(ulSeq, null) then
        ""
    else
        ulSeq
        |> Seq.collect (fun ul -> ul.ChildNodes :> seq<HtmlNode>)
        |> Seq.map (fun child ->
            let text = child.InnerText.Trim()
            // Strip the spaces that follow each embedded line break.
            let unindented =
                System.Text.RegularExpressions.Regex.Replace(text, "\n[ ]*", System.Environment.NewLine)
            unindented + System.Environment.NewLine)
        // String.concat builds the result in one pass instead of the
        // quadratic repeated (+) of a fold.
        |> String.concat ""

let url = @"http://www.news.baidu.com"

// Holds the headlines of all categories; rebuilt on every refresh.
let mutable outputStr = ""

// The most recently downloaded page; replaced on every refresh.
let mutable html = asyncGrapUrl(url)

// Snapshots taken from the first download only. Kept for backward
// compatibility; the category functions below deliberately do NOT use them,
// because they would keep serving the stale first page after a refresh.
let htmlnode = html.DocumentNode
// The "container" div of the page body (may be null if the page has no such id).
let content = html.GetElementbyId("container")

// Evaluates each absolute XPath against the CURRENT document and returns all
// matching nodes in order. SelectNodes returns null when nothing matches, so
// a missing section contributes no nodes instead of raising.
let private selectUls (xpaths : string list) : seq<HtmlNode> =
    let root = html.DocumentNode
    xpaths
    |> Seq.collect (fun xpath ->
        match root.SelectNodes(xpath) with
        | null -> Seq.empty
        | nodes -> nodes :> seq<HtmlNode>)

// Headlines for a category described by a list of XPath expressions.
let private headlines (xpaths : string list) =
    selectUls xpaths |> extractTextFromUl

// Focus news
let focus() =
    headlines [ "/html[1]/body[1]/div[4]/div[3]/div/div/div/ul" ]

// Domestic (China) news
let china() =
    headlines [ "/html[1]/body[1]/div[4]/div[4]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[4]/div/dl/dd/ul" ]

// International news
let world() =
    headlines [ "/html[1]/body[1]/div[4]/div[5]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[5]/div/dl/dd/ul" ]

// Society
let social() =
    headlines [ "/html[1]/body[1]/div[4]/div[6]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[6]/div/dl/dd/div/div/ul"
                "/html[1]/body[1]/div[4]/div[6]/div/dl/dd/ul" ]

// Military news
let military() =
    headlines [ "/html[1]/body[1]/div[4]/div[7]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[7]/div/dl/dd/ul" ]

// Economy news
let economy() =
    headlines [ "/html[1]/body[1]/div[4]/div[9]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[9]/div/dl/dd/ul" ]

// Internet news
let internet() =
    headlines [ "/html[1]/body[1]/div[4]/div[10]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[10]/div/dl/dd/ul" ]

// Housing and cars
let houseAndCar() =
    headlines [ "/html[1]/body[1]/div[4]/div[12]/div/dl/dd/table/tr/td/div/ul"
                "/html[1]/body[1]/div[4]/div[12]/div/dl/dd/ul" ]

// Sports news
let sport() =
    headlines [ "/html[1]/body[1]/div[4]/div[13]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[13]/div/dl/dd/div/div/ul"
                "/html[1]/body[1]/div[4]/div[13]/div/dl/dd/ul" ]

// Entertainment news
let entertainment() =
    headlines [ "/html[1]/body[1]/div[4]/div[16]/div/dl/dd/div/ul"
                "/html[1]/body[1]/div[4]/div[16]/div/dl/dd/div/div/ul" ]

// Health
let health() =
    headlines [ "/html[1]/body[1]/div[4]/div[18]/div/dl/dd/table/tr/td/div/ul"
                "/html[1]/body[1]/div[4]/div[18]/div/dl/dd/ul" ]

// Technology
let itAndFound() =
    headlines [ "/html[1]/body[1]/div[4]/div[21]/div/dl/dd/table/tr/td/div/ul"
                "/html[1]/body[1]/div[4]/div[21]/div/dl/dd/ul" ]

// Education
let educationAndGame() =
    headlines [ "/html[1]/body[1]/div[4]/div[22]/div/dl/dd/table/tr/td/div/ul"
                "/html[1]/body[1]/div[4]/div[22]/div/dl/dd/ul" ]

// Concatenates every category, in the same order the menu lists them.
let private collectAllNews () =
    [ focus(); china(); world(); social(); military(); economy(); internet()
      houseAndCar(); sport(); entertainment(); health(); itAndFound()
      educationAndGame() ]
    |> String.concat ""

outputStr <- collectAllNews ()

// Re-downloads the page and rebuilds the "all news" text. On failure the
// previous page and text stay in place so clients keep getting answers.
let private refreshNews () =
    try
        html <- asyncGrapUrl(url)
        outputStr <- collectAllNews ()
    with e ->
        printfn "News refresh failed: %s" e.Message

// Refresh the data once per hour.
let timer = new System.Timers.Timer(1000.0 * 60.0 * 60.0)
timer.Elapsed.Add(fun _ -> refreshNews ())
timer.Enabled <- true

// Disabled debugging aid: write all headlines to a text file and open it in Notepad.
//let fileNameTxt = @"D:\" + url.Replace('.','0').Replace('/','0').Replace(':','0') + ".txt"
//if(File.Exists(fileNameTxt)) then
//    File.Delete(fileNameTxt)
//let fileInfo = new System.IO.FileStream(fileNameTxt,FileMode.OpenOrCreate,FileAccess.ReadWrite)
//let writer = new System.IO.StreamWriter(fileInfo,Text.Encoding.UTF8)
//writer.WriteLine(outputStr)
//writer.Close()
//let procStartInfo = new ProcessStartInfo("notepad.exe",fileNameTxt)
//let proc = new Process()
//proc.StartInfo <- procStartInfo
//proc.Start() |> ignore
这里用到了一个第三方库 HtmlAgilityPack.dll,它提供了功能强大的 HTML 解析和网页抓取方法,可以试一下。
另一个文件是Program.fs:
View Code
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
open System.Net.Sockets
open System.Net
open System.IO
open System
open System.Collections.Generic
open BaiduNews

// Async-workflow wrapper around the Begin/End accept pair. Kept for
// compatibility; the server below uses the blocking AcceptTcpClient.
type System.Net.Sockets.TcpListener with
    member this.AsyncAcceptTcpClient() =
        Async.FromBeginEnd(this.BeginAcceptTcpClient, this.EndAcceptTcpClient)

// Text-menu news server listening on 127.0.0.1:4242.
// Each client receives the menu once, then may send category numbers;
// every number is answered with the matching headlines.
type Server() =

    // The client decodes (and encodes) the stream as gb2312, so both
    // directions on the server must use the same encoding or the Chinese
    // text arrives garbled.
    let wireEncoding = Text.Encoding.GetEncoding("gb2312")

    let menuLines =
        [ "请选择要看的新闻类型,输入序号并回撤 :"
          "1:焦点新闻"
          "2:国内新闻"
          "3:世界新闻"
          "4:社会"
          "5:军事新闻"
          "6:经济新闻"
          "7:网络新闻"
          "8:房与车"
          "9:体育新闻"
          "10:娱乐新闻"
          "11:健康"
          "12:科技"
          "13:教育"
          "14:全部" ]

    let invalidIndexReply = "Inviad index!"

    // Maps a menu number to the text sent back to the client.
    let newsFor (item : int) =
        match item with
        | 1 -> BaiduNews.focus()
        | 2 -> BaiduNews.china()
        | 3 -> BaiduNews.world()
        | 4 -> BaiduNews.social()
        | 5 -> BaiduNews.military()
        | 6 -> BaiduNews.economy()
        | 7 -> BaiduNews.internet()
        | 8 -> BaiduNews.houseAndCar()
        | 9 -> BaiduNews.sport()
        | 10 -> BaiduNews.entertainment()
        | 11 -> BaiduNews.health()
        | 12 -> BaiduNews.itAndFound()
        | 13 -> BaiduNews.educationAndGame()
        | 14 -> BaiduNews.outputStr
        | _ -> invalidIndexReply

    // Serves one connected client until it disconnects or an error occurs.
    // Errors are printed; the socket is always closed.
    let serveClient (client : TcpClient) =
        try
            try
                let strPipe = client.GetStream()
                let strWriter = new StreamWriter(strPipe, wireEncoding)
                for line in menuLines do
                    strWriter.WriteLine(line)
                strWriter.Flush()

                let buffer = Array.create 216 0uy
                let mutable read = strPipe.Read(buffer, 0, buffer.Length)
                // Read returns 0 once the client has closed the connection.
                while read > 0 do
                    let request = wireEncoding.GetString(buffer, 0, read).Trim()
                    // The client may deliver a bare line break as a separate
                    // chunk; that is not a request, so skip it.
                    if request <> "" then
                        let respStr =
                            // TryParse lets non-numeric input reach the
                            // "invalid index" reply instead of throwing and
                            // killing the session.
                            match System.Int32.TryParse(request) with
                            | true, item -> newsFor item
                            | _ -> invalidIndexReply
                        strWriter.Write("Got From Server:")
                        strWriter.WriteLine(System.Environment.NewLine)
                        strWriter.WriteLine(respStr)
                        strWriter.Flush()
                    read <- strPipe.Read(buffer, 0, buffer.Length)
            with e ->
                printfn "%s" e.Message
        finally
            client.Close()

    // Starts listening and the hourly refresh timer, then accepts clients
    // forever. Never returns.
    member this.Start() =
        let ip = System.Net.IPAddress.Parse("127.0.0.1")
        let tcpListener = new TcpListener(ip, 4242)
        tcpListener.Start()
        timer.Start()
        while true do
            // Blocks until a client connects; no polling of Pending(), which
            // would spin a CPU core while idle.
            let client = tcpListener.AcceptTcpClient()
            async { serveClient client } |> Async.Start

[<EntryPoint>]
let main argv =
    (new Server()).Start()
    0 // return an integer exit code
客户端只有一个小窗口:),代码如下:
View Code
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
open System.Windows.Forms
open System
open System.Drawing
open System.Net.Sockets
open System.IO

// Minimal WinForms client: a read-only output box on top, an input box at the
// bottom. Pressing Enter in the input box sends its text to the server.
let form = new Form()
form.Text <- "Test"

// Pane heights as a percentage of the form height. Multiply before dividing:
// `height / 100 * pct` truncates first and loses up to 99 pixels of height.
let percentOfFormHeight (pct : int) = form.Height * pct / 100

let input = new System.Windows.Forms.TextBox()
input.Multiline <- true
input.Dock <- DockStyle.Bottom
input.Height <- percentOfFormHeight 20
form.Controls.Add(input)

let output = new System.Windows.Forms.TextBox()
output.Multiline <- true
output.ReadOnly <- true
output.Dock <- DockStyle.Top
output.ScrollBars <- ScrollBars.Both
output.Height <- percentOfFormHeight 70

let client = new TcpClient()
//10.225.150.4
//127.0.0.1
client.Connect("127.0.0.1", 4242)
let sr = new StreamReader(client.GetStream(), Text.Encoding.GetEncoding("gb2312"))
let sw = new StreamWriter(client.GetStream(), Text.Encoding.GetEncoding("gb2312"))
form.Controls.Add(output)

// Sends the input box content to the server once the user has pressed Enter
// (i.e. the box holds more than one line), then clears the box.
let keyUp() =
    if input.Lines.Length > 1 then
        let text = input.Text
        if text <> null && text <> "" then
            try
                sw.WriteLine(text)
                sw.Flush()
            with _ ->
                printfn "Server error"
        input.Text <- ""

input.KeyUp.Add(fun _ -> keyUp())

// Appends one received line plus a blank line to the output box.
// Controls may only be touched from the UI thread, so calls coming from the
// receiver thread are marshalled with BeginInvoke.
let appendToOutput (text : string) =
    let append () =
        output.AppendText(text)
        output.AppendText(System.Environment.NewLine)
        output.AppendText(System.Environment.NewLine)
    try
        if output.InvokeRequired then
            output.BeginInvoke(new Action(append)) |> ignore
        else
            append ()
    with
    // The form was closed while a line was in flight; nothing left to update.
    // (ObjectDisposedException derives from InvalidOperationException.)
    | :? InvalidOperationException -> ()

// Starts the dedicated thread that receives the server's replies.
let getBackStr() =
    let receiveLoop () =
        try
            let mutable text = sr.ReadLine()
            // ReadLine returns null when the server closes the connection;
            // stop then instead of spinning forever.
            while text <> null do
                if text <> "" then
                    printfn "%s" text
                    appendToOutput text
                text <- sr.ReadLine()
        with
        // The connection was dropped or closed by form shutdown.
        | :? IOException -> ()
        | :? ObjectDisposedException -> ()
    let thread = new Threading.Thread(new Threading.ThreadStart(fun () -> receiveLoop ()))
    // Background thread: a blocked ReadLine must not keep the process alive
    // after the window has been closed.
    thread.IsBackground <- true
    thread.Start()

form.Load.Add(fun _ -> getBackStr())

let resizePanes () =
    input.Height <- percentOfFormHeight 20
    output.Height <- percentOfFormHeight 70

form.SizeChanged.Add(fun _ -> resizePanes ())

let shutdown () =
    // Closing the socket unblocks the receiver thread's pending ReadLine.
    client.Close()
    Application.Exit()

form.Closing.Add(fun _ -> shutdown ())

// WinForms requires a single-threaded apartment for the UI thread.
[<EntryPoint; STAThread>]
let main argv =
    do Application.Run(form)
    0 // return an integer exit code
目前实现的功能是将所有新闻的标题(也就是新闻概要)抽取出来,还没有实现进一步获取新闻正文。
目前效果:
接下来实现 抓取用户指定的新闻的具体内容:)
更新:服务端启动时就去抓取数据,之后按固定时间间隔刷新数据。