F#小程序 -- 抓取百度新闻

公司不让看新闻,这几天就做了个小玩意儿用来看新闻(目前只能抓取标题,还没有实现抓取具体内容……)。代码如下:

服务器端由两个文件组成,第一个:

BaiduNews.fs:

View Code
module BaiduNews

#if INTERACTIVE
#r @"C:\Users\v-shuzhu\Desktop\HtmlAgilityPack.dll"
#endif
open System
open System.Diagnostics
open System.Net
open System.Xml
open System.IO
open HtmlAgilityPack

// Downloads the page at `newUrl`, parses it as gb2312-encoded HTML, writes an XML
// rendering of it to D:\ and returns the parsed HtmlDocument.
//
// The snapshot file name is the URL with every '.', '/' and ':' replaced by '0'.
// Although the body is an async workflow, it is run with Async.RunSynchronously,
// so the caller is blocked until the download and the save have finished.
// Network, parsing and file errors propagate to the caller as exceptions.
let asyncGrapUrl(newUrl : string) =
    async{
        let fileNameXml = @"D:\" + newUrl.Replace('.','0').Replace('/','0').Replace(':','0') + ".xml"

        // Remove the previous snapshot so the file on disk never mixes old and new content.
        if(File.Exists(fileNameXml)) then
            File.Delete(fileNameXml)

        let httpRequest = HttpWebRequest.Create(newUrl) :?> HttpWebRequest
        // `use!` / `use` dispose the response and its stream on every exit path,
        // including the case where parsing or saving throws.
        use! httpRespon = Async.AwaitTask(httpRequest.GetResponseAsync())
        use responStream = httpRespon.GetResponseStream()

        let xml = new HtmlDocument()
        xml.Load(responStream,Text.Encoding.GetEncoding("gb2312"),true)
        xml.OptionOutputAsXml <- true
        xml.Save(fileNameXml)

        return xml
    } |> Async.RunSynchronously

// Flattens the text of every child node of the given <ul> elements into one string.
//
// ulSeq  : the list nodes to read; may be null (HtmlNode.SelectNodes returns null when
//          an XPath matches nothing), in which case the result is the empty string.
// return : for each child node its trimmed InnerText, with the spaces that follow an
//          embedded line feed removed, each entry terminated by Environment.NewLine.
let extractTextFromUl(ulSeq : seq<HtmlNode>) =
    if obj.ReferenceEquals(ulSeq, null) then
        ""
    else
        ulSeq
        |> Seq.collect (fun (ul : HtmlNode) -> ul.ChildNodes)
        |> Seq.map (fun (item : HtmlNode) ->
            let trimmed = item.InnerText.Trim()
            // Drop the indentation at the start of every continuation line.
            let unindented =
                System.Text.RegularExpressions.Regex.Replace(trimmed, "\n[ ]*", System.Environment.NewLine)
            unindented + System.Environment.NewLine)
        // String.concat builds the result in one pass; folding with (+) copies the
        // accumulated string again for every item.
        |> String.concat ""

// Address of the page that is scraped.
let url = @"http://www.news.baidu.com"
// Holds the headlines of all categories; the category sections further down in this
// file append to it while the module initialises.
let mutable outputStr = ""
// Most recently downloaded copy of the page.
let mutable html = asyncGrapUrl(url)

// Root node of the document downloaded at start-up.
let htmlnode = html.DocumentNode
// The element with id "container"; per the original author all news lives inside it.
// Mutable so that the hourly refresh below can replace it: the category functions
// read `content`, so refreshing only `html` would leave them on the start-up page.
let mutable content = html.GetElementbyId("container")

// Refresh the page once an hour.
let timer = new System.Timers.Timer(1000.0 * 60.0 * 60.0)
timer.Elapsed.Add(fun _ ->
    try
        let fresh = asyncGrapUrl(url)
        let freshContent = fresh.GetElementbyId("container")
        // Keep serving the previous snapshot when the new page has no container,
        // rather than publishing a null that every category function would trip over.
        if not (obj.ReferenceEquals(freshContent, null)) then
            html <- fresh
            content <- freshContent
    with e ->
        // A failed refresh must not go unnoticed, but it must not stop the timer either.
        eprintfn "News refresh failed: %s" e.Message)
timer.Enabled <- true

//let container = content.SelectNodes("/html[1]/body[1]/div[4]/div/div/div/ul")

// Focus news: text of the list items found under the focus section of `content`.
let focus() =
    // SelectNodes returns null, not an empty collection, when the XPath matches nothing;
    // report "no headlines" instead of handing the null to the Seq pipeline.
    match content.SelectNodes("/html[1]/body[1]/div[4]/div[3]/div/div/div/ul") with
    | null -> ""
    | focusNews -> focusNews |> extractTextFromUl

outputStr <- outputStr + focus()

// Domestic news: the div-wrapped lists first, then the lists directly under <dd>.
let china() =
    [ "/html[1]/body[1]/div[4]/div[4]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[4]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + china()
// International news: the div-wrapped lists first, then the lists directly under <dd>.
let world() =
    [ "/html[1]/body[1]/div[4]/div[5]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[5]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + world()

// Society news: the two main list groups in order, followed by the aside lists.
let social() =
    [ "/html[1]/body[1]/div[4]/div[6]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[6]/div/dl/dd/div/div/ul"
      "/html[1]/body[1]/div[4]/div[6]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + social()
// Military news: the div-wrapped lists first, then the lists directly under <dd>.
let military() =
    [ "/html[1]/body[1]/div[4]/div[7]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[7]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + military()
// Economy news: the main lists first, then the aside lists.
let economy() =
    [ "/html[1]/body[1]/div[4]/div[9]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[9]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + economy()
// Internet news: the main lists first, then the aside lists.
let internet() =
    [ "/html[1]/body[1]/div[4]/div[10]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[10]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + internet()
// Housing and cars: the table-based main lists first, then the aside lists.
let houseAndCar() =
    [ "/html[1]/body[1]/div[4]/div[12]/div/dl/dd/table/tr/td/div/ul"
      "/html[1]/body[1]/div[4]/div[12]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + houseAndCar()
// Sports news: the main lists, then the nested aside lists, then the lists directly under <dd>.
let sport() =
    [ "/html[1]/body[1]/div[4]/div[13]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[13]/div/dl/dd/div/div/ul"
      "/html[1]/body[1]/div[4]/div[13]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + sport()
// Entertainment news: the main lists first, then the nested aside lists.
let entertainment() =
    [ "/html[1]/body[1]/div[4]/div[16]/div/dl/dd/div/ul"
      "/html[1]/body[1]/div[4]/div[16]/div/dl/dd/div/div/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + entertainment()
// Health news: the table-based main lists first, then the aside lists.
let health() =
    [ "/html[1]/body[1]/div[4]/div[18]/div/dl/dd/table/tr/td/div/ul"
      "/html[1]/body[1]/div[4]/div[18]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + health()

// Technology news: the table-based main lists first, then the aside lists.
let itAndFound() =
    [ "/html[1]/body[1]/div[4]/div[21]/div/dl/dd/table/tr/td/div/ul"
      "/html[1]/body[1]/div[4]/div[21]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + itAndFound()
// Education news: the table-based main lists first, then the aside lists.
let educationAndGame() =
    [ "/html[1]/body[1]/div[4]/div[22]/div/dl/dd/table/tr/td/div/ul"
      "/html[1]/body[1]/div[4]/div[22]/div/dl/dd/ul" ]
    |> Seq.collect (fun (xpath : string) ->
        // SelectNodes returns null when nothing matches; Seq.append would throw on it.
        match content.SelectNodes(xpath) with
        | null -> Seq.empty
        | uls -> uls :> seq<HtmlNode>)
    |> extractTextFromUl

outputStr <- outputStr + educationAndGame()
//
//let fileNameTxt = @"D:\" + url.Replace('.','0').Replace('/','0').Replace(':','0') + ".txt"
//if(File.Exists(fileNameTxt)) then
//    File.Delete(fileNameTxt)
//let fileInfo = new System.IO.FileStream(fileNameTxt,FileMode.OpenOrCreate,FileAccess.ReadWrite)
//let writer = new System.IO.StreamWriter(fileInfo,Text.Encoding.UTF8)
//writer.WriteLine(outputStr)
//writer.Close()
//let procStartInfo = new ProcessStartInfo("notepad.exe",fileNameTxt)
//let proc = new Process()
//proc.StartInfo <- procStartInfo
//proc.Start() |> ignore

这里用到了一个第三方库:HtmlAgilityPack.dll,它提供了一组功能强大的 HTML 解析和节点查询(XPath)方法,可以试一下。

另一个文件是Program.fs:

View Code
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
open System.Net.Sockets
open System.Net
open System.IO
open System
open System.Collections.Generic
open BaiduNews
type System.Net.Sockets.TcpListener with
    // Exposes the listener's Begin/End accept pair as an F# async workflow
    // that yields the accepted TcpClient.
    member listener.AsyncAcceptTcpClient() =
       Async.FromBeginEnd(
           (fun (callback, state) -> listener.BeginAcceptTcpClient(callback, state)),
           (fun asyncResult -> listener.EndAcceptTcpClient(asyncResult)))

// Text-menu news server: sends each client a numbered menu, then answers every
// number the client sends with the headlines of that category.
type Server() =
    class
        // Listens on 127.0.0.1:4242, starts the BaiduNews refresh timer and serves
        // each accepted client on its own thread-pool work item.
        // Does not return: the accept loop runs until an exception escapes it.
        member this.Start() =
            let ip = System.Net.IPAddress.Parse("127.0.0.1")
            // The client in this project decodes the stream as gb2312, so the menu and
            // the headlines must be written with that encoding; the StreamWriter
            // default (UTF-8) reaches the client as garbled Chinese text.
            let gb2312 = Text.Encoding.GetEncoding("gb2312")

            let tcpListener = new TcpListener(ip, 4242)
            tcpListener.Start()
            timer.Start()

            let menu =
                [ "请选择要看的新闻类型,输入序号并回撤 :"
                  "1:焦点新闻"
                  "2:国内新闻"
                  "3:世界新闻"
                  "4:社会"
                  "5:军事新闻"
                  "6:经济新闻"
                  "7:网络新闻"
                  "8:房与车"
                  "9:体育新闻"
                  "10:娱乐新闻"
                  "11:健康"
                  "12:科技"
                  "13:教育"
                  "14:全部" ]

            // Handles one connection until the client disconnects or an I/O error occurs.
            let serve (client : TcpClient) =
                try
                    try
                        let strPipe = client.GetStream()
                        let strWriter = new StreamWriter(strPipe, gb2312)
                        menu |> List.iter (fun (line : string) -> strWriter.WriteLine(line))
                        strWriter.Flush()

                        let buffer = Array.create 216 0uy
                        // Read returns 0 once the client has closed its side of the connection.
                        let mutable read = strPipe.Read(buffer, 0, buffer.Length)
                        while read > 0 do
                            let allText = gb2312.GetString(buffer, 0, read)
                            let respStr =
                                // TryParse tolerates the surrounding line breaks the client sends
                                // and lets non-numeric input fall through to the error reply
                                // instead of ending the session with a FormatException.
                                match System.Int32.TryParse(allText) with
                                | true, 1 -> BaiduNews.focus()
                                | true, 2 -> BaiduNews.china()
                                | true, 3 -> BaiduNews.world()
                                | true, 4 -> BaiduNews.social()
                                | true, 5 -> BaiduNews.military()
                                | true, 6 -> BaiduNews.economy()
                                | true, 7 -> BaiduNews.internet()
                                | true, 8 -> BaiduNews.houseAndCar()
                                | true, 9 -> BaiduNews.sport()
                                | true, 10 -> BaiduNews.entertainment()
                                | true, 11 -> BaiduNews.health()
                                | true, 12 -> BaiduNews.itAndFound()
                                | true, 13 -> BaiduNews.educationAndGame()
                                | true, 14 -> BaiduNews.outputStr
                                | _ -> "Inviad index!"
                            strWriter.Write("Got From Server:")
                            System.Environment.NewLine |> strWriter.WriteLine
                            respStr |> strWriter.WriteLine
                            strWriter.Flush()
                            read <- strPipe.Read(buffer, 0, buffer.Length)
                    with
                        e -> printfn "%s" e.Message
                finally
                    // Release the socket whether the session ended normally or with an error.
                    client.Close()

            // A blocking accept replaces polling Pending() in a tight loop, which kept a
            // core busy and could start several accept workflows for one pending client.
            while(true) do
                let client = tcpListener.AcceptTcpClient()
                async { serve client } |> Async.Start
    end
[<EntryPoint>]
let main argv =
    // Build the server and hand control to its accept loop.
    let server = new Server()
    server.Start()
    // Exit code reported to the operating system.
    0

客户端只有一个小窗口:),代码如下:

View Code
// Learn more about F# at http://fsharp.net
// See the 'F# Tutorial' project for more help.
open System.Windows.Forms
open System
open System.Drawing
open System.Net.Sockets
open System.IO 
// Main window of the client.
let form = new Form()

form.Text <- "Test"
// Request box docked to the bottom edge; the user types the menu number here.
let input = new System.Windows.Forms.TextBox()
input.Multiline <- true
input.Dock <- DockStyle.Bottom
// Integer arithmetic: (form.Height / 100) is computed first, then multiplied by 20.
input.Height <- form.Height / 100 * 20
form.Controls.Add(input)

// Read-only box docked to the top edge; shows what the server sends back.
let output = new System.Windows.Forms.TextBox()
output.Multiline <- true
output.ReadOnly <- true
output.Dock <- DockStyle.Top

output.ScrollBars <- ScrollBars.Both
// Same integer arithmetic as for the input box, with a factor of 70.
output.Height <- form.Height / 100 * 70

// The connection is opened while the module initialises, before the form is shown.
let client = new TcpClient()
// Addresses noted by the author: 10.225.150.4
// and 127.0.0.1 (the one in use)
client.Connect("127.0.0.1",4242)

// Reader and writer over the same network stream, both using gb2312.
let sr = new StreamReader(client.GetStream(),Text.Encoding.GetEncoding("gb2312"))
let sw = new StreamWriter(client.GetStream(),Text.Encoding.GetEncoding("gb2312"))
form.Controls.Add(output)
// Sends the content of the input box to the server and then clears the box.
let keyUp() =
    // More than one line in the box means the user has pressed Enter.
    let enterPressed = input.Lines.Length > 1
    if enterPressed then
        let message = input.Text
        if not (String.IsNullOrEmpty(message)) then
            try
                sw.WriteLine(message)
                sw.Flush()
            with _ ->
                printfn "Server error"
        input.Text <- ""

input.KeyUp.Add(fun _ -> keyUp())

// Starts a background thread that reads the server's replies line by line and shows
// every non-empty line in the output box, followed by two line breaks.
let getBackStr() =
    // Runs on the UI thread (see the Invoke call below).
    let show (text : string) =
        output.AppendText(text)
        output.AppendText(System.Environment.NewLine)
        output.AppendText(System.Environment.NewLine)

    let getStr() =
        try
            let mutable text = sr.ReadLine()
            // ReadLine returns null once the server has closed the connection; stopping
            // here avoids spinning forever on an exhausted stream.
            while not (obj.ReferenceEquals(text, null)) do
                if text <> "" then
                    // Immutable copy: a closure cannot capture a mutable local.
                    let line = text
                    printfn "%s" line
                    // Controls may only be touched from the thread that created them,
                    // so the update is marshalled to the UI thread.
                    form.Invoke(new MethodInvoker(fun () -> show line)) |> ignore
                text <- sr.ReadLine()
        with
        | :? IOException as err ->
            printfn "Connection to server lost: %s" err.Message
        | :? InvalidOperationException as err ->
            // Raised by Invoke when the form has already been closed or disposed.
            printfn "Receiver stopped: %s" err.Message

    // Dedicated thread for receiving the server's replies.
    let thread = new Threading.Thread(new Threading.ThreadStart(fun () -> getStr()))
    // A background thread does not keep the process alive after the form is closed.
    thread.IsBackground <- true
    thread.Start()
form.Load.Add(fun _ -> getBackStr())
// Keep both text boxes proportional to the window whenever it is resized.
form.SizeChanged.Add(fun _ ->
    // One hundredth of the window height, rounded down by integer division.
    let unitHeight = form.Height / 100
    input.Height <- unitHeight * 20
    output.Height <- unitHeight * 70)
// Closing the window ends the application.
form.Closing.Add(fun _ -> Application.Exit())


[<EntryPoint>]
let main argv =
    // Run the message loop until the form is closed.
    Application.Run(form)
    // Exit code reported to the operating system.
    0

目前实现的功能是:将所有新闻的标题(也就是新闻概要)抽取出来,还没有实现进一步抓取新闻的具体内容。

目前效果:

接下来实现 抓取用户指定的新闻的具体内容:)

 更新下,服务端启动就去取数据,固定时间间隔刷新下数据~

转载于:https://www.cnblogs.com/FsharpZack/archive/2013/01/06/2848073.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值