用正则表达式提取网页上表格的内容,
并把提取到的内容转换成用逗号分隔的文本。
需要修改的只有第一行正则表达式中的 <table id=""t1"" 部分,它随表格的定义而不同。
// Table matcher. Capture group 1 is everything between the opening tag of the
// table whose first attribute is id="t1" and the first following </table>.
// Adjust the `id=""t1""` part to select a different table.
let rx = new Regex(@"(?<=<table id=""t1""[^>]*?>)([\s\S]*?)(?=</table>)",RegexOptions.IgnoreCase|||RegexOptions.Singleline);;
// Row matcher. Capture group 1 is the inner HTML of one <tr>...</tr>.
// `<tr\b[^>]*>` accepts rows that carry attributes (e.g. <tr class="odd">),
// in line with the table and cell patterns; a bare `<tr>` pattern skipped them.
let rxTR = new Regex(@"(?<=<tr\b[^>]*>)([\s\S]*?)(?=</tr>)",RegexOptions.IgnoreCase|||RegexOptions.Singleline);;
// Cell matcher. Capture group 1 is the cell content with surrounding
// whitespace removed; consumers must read Groups.[1], not the whole match.
// The tags are consumed (no lookarounds) and the capture is lazy so that:
//   * adjacent cells without whitespace between them are not merged,
//   * cells whose text contains spaces are still matched,
//   * a whitespace-only cell yields exactly one empty match.
let rxTD = new Regex(@"<td\b[^>]*>\s*([\s\S]*?)\s*</td>",RegexOptions.IgnoreCase|||RegexOptions.Singleline);;
// Nested extraction over the HTML text `s`: for every match of `rx`, every
// match of `rxTR` inside its group 1, and every match of `rxTD` inside that
// row's group 1, keep the text of capture group 1.
// Result: one list per `rx` match, containing one list per `rxTR` match,
// containing the group-1 strings of the `rxTD` matches.
let s4 =
    // Group-1 text of every match of `pattern` in `text`, in match order.
    let firstGroups (pattern : Regex) (text : string) =
        [ for m in pattern.Matches(text) -> m.Groups.[1].Value ]
    firstGroups rx s
    |> List.map (fun tableHtml ->
        firstGroups rxTR tableHtml
        |> List.map (fun rowHtml -> firstGroups rxTD rowHtml));;
// For every row of every table in `s4`, join the row's cell strings with ","
// and pass the resulting line to `writeToFile` with the file name "S4-4.txt".
// The loops produce no value, so `s5` itself is bound to unit.
let s5 =
    for table in s4 do
        for row in table do
            writeToFile "S4-4.txt" (String.Join(",", row))