本程序编写初衷只是想稍微帮下一朋友从某网页抓取数据资料,代码及逻辑都很简单(目标网页貌似很不怎样,各方面性能都较差...),可修改优化之处很多,仅供业余摆弄。
{
问题来源: http://jdxx.zhs.mofcom.gov.cn/website/btgs.jsp
实现步骤:
1、浏览指定网页,使用者输入查询条件查询;
2、点击 获取网页内容 按钮,进行
2.1、跳到 第N页,保存其内容到 ListView;重复...
2.2、保存 ListView 内容 到 Excel;
}
unit NetFetch;
interface
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls, Buttons, ExtCtrls, OleCtrls, SHDocVw, XPMan, ComCtrls;
type
Tfrm_Main = class(TForm)
XPManifest1: TXPManifest;
wb_WebContent: TWebBrowser;
Panel1: TPanel;
Panel2: TPanel;
Label1: TLabel;
btn_GotoURL: TSpeedButton;
edt_URL: TEdit;
btn_Close: TButton;
dlg_SaveInfo: TSaveDialog;
Tmr_ClsDlg: TTimer;
btn_GetNetInfo: TButton;
lv_Info: TListView;
lbl_Hints: TLabel;
Tmr_GotoURL: TTimer;
procedure FormShow(Sender: TObject);
procedure FormResize(Sender: TObject);
procedure edt_URLKeyDown(Sender: TObject; var Key: Word;
Shift: TShiftState);
procedure btn_GotoURLClick(Sender: TObject);
procedure btn_GetNetInfoClick(Sender: TObject);
procedure btn_CloseClick(Sender: TObject);
procedure Tmr_GotoURLTimer(Sender: TObject);
procedure Tmr_ClsDlgTimer(Sender: TObject);
private
{ Private declarations }
FTotalCount: string; // 总数据条数
FTotalPageCount: string; // 总页数
public
{ Public declarations }
function GetDatasInfo: Boolean; // 得到数据条数、数据总页数
function GotoNthPage(PageIndex: string): Boolean; // 转至第 PageIndex 页
function SaveInfoToLV: Boolean; // 保存抓取的数据入 ListView
function SaveLVToXLS(FileName: string): Boolean; // 将 ListView 内容存入 Excel
end;
var
frm_Main: Tfrm_Main;
implementation
uses
MSHtml, ComObj, ShellAPI;
{$R *.dfm}
procedure Tfrm_Main.FormShow(Sender: TObject);
begin
Tmr_GotoURL.Enabled:= True;
edt_URL.SetFocus;
end;
procedure Tfrm_Main.FormResize(Sender: TObject);
begin
btn_GetNetInfo.Left:= (Width - 190) div 2;
btn_Close.Left:= btn_GetNetInfo.Left + 130;
end;
procedure Tfrm_Main.edt_URLKeyDown(Sender: TObject; var Key: Word;
Shift: TShiftState);
begin
if Key = VK_RETURN then
begin
Key:= 0;
wb_WebContent.Navigate(edt_URL.Text);
end;
end;
procedure Tfrm_Main.btn_GotoURLClick(Sender: TObject);
begin
wb_WebContent.Navigate(edt_URL.Text);
end;
procedure Tfrm_Main.btn_GetNetInfoClick(Sender: TObject);
var
I: Integer;
J: Integer;
begin
if lv_Info.Items.Count > 0 then
begin
with lv_Info do
begin
Items.BeginUpdate;
Items.Clear;
Items.EndUpdate;
end;
end;
btn_GetNetInfo.Enabled:= False;
btn_Close.Enabled:= False;
try
if not GetDatasInfo then
Exit;
if FTotalPageCount = '1' then // 只 1页,直接保存退出即可
begin
SaveInfoToLV;
end
else
begin // 多于 1页,先保存第 1 页内容,再...
SaveInfoToLV;
for I:= 2 to StrToInt(FTotalPageCount) do
begin
if not GotoNthPage(IntToStr(I)) then
begin
MessageBox(
Handle,
'网页跳转失败,程序无法继续抓取数据! ',
'错误',
MB_OK+ MB_ICONERROR
);
Exit;
end;
lbl_Hints.Caption:= '正在获取第 ' + IntToStr(I) + '/' + FTotalPageCount + ' 页数据信息...';
Update;
Application.ProcessMessages;
while wb_WebContent.Busy do
Application.ProcessMessages;
ShowMessage('转页...'); // 延时,可以优化...
for J:= 0 to 20 do
Application.ProcessMessages;
ShowMessage('转页毕...');
Application.ProcessMessages;
Sleep(200);
Application.ProcessMessages;
while True do
begin
if SaveInfoToLV then
Break
else
begin
if not GotoNthPage(IntToStr(I)) then
begin
MessageBox(
Handle,
PChar('网页跳转失败(第 ' + IntToStr(I) + ' 页),程序无法继续抓取数据! '),
'错误',
MB_OK+ MB_ICONERROR
);
Exit;
end;
end;
end;
Application.ProcessMessages;
end;
end;
finally
btn_GetNetInfo.Enabled:= True;
btn_Close.Enabled:= True;
lbl_Hints.Caption:= '';
Update;
end;
if dlg_SaveInfo.Execute then
begin
if SaveLVToXLS(dlg_SaveInfo.FileName) then
begin
if MessageBox(
Handle,
PChar(
'所获取的网页数据信息已成功保存至 ' + dlg_SaveInfo.FileName +
',是否现在查看?'
),
'提示',
MB_YESNO + MB_ICONQUESTION
) = IDYES then
begin
if ShellExecute(0, 'Open', PChar(dlg_SaveInfo.FileName), nil, nil, SW_SHOW) <= 32 then
MessageBox(Handle, '打开文件失败! ', '提示', MB_OK + MB_ICONINFORMATION);
end;
end;
end;
with lv_Info do
begin
Items.BeginUpdate;
Items.Clear;
Items.EndUpdate;
end;
end;
procedure Tfrm_Main.btn_CloseClick(Sender: TObject);
begin
Close;
end;
procedure Tfrm_Main.Tmr_GotoURLTimer(Sender: TObject);
begin
Tmr_GotoURL.Enabled:= False;
Screen.Cursor:= crHourGlass;
wb_WebContent.Navigate(edt_URL.Text);
while wb_WebContent.Busy do
Application.ProcessMessages;
Screen.Cursor:= crDefault;
end;
procedure Tfrm_Main.Tmr_ClsDlgTimer(Sender: TObject);
var
H: THandle;
begin
H:= FindWindow('TMessageForm', PChar(Application.Title));
if H > 0 then
begin
SendMessage(H, WM_KEYDOWN, VK_SPACE, 0);
SendMessage(H, WM_CLOSE, 0, 0);
end;
end;
function Tfrm_Main.GetDatasInfo: Boolean;
var
ws: string;
begin
Result:= False;
ws:= (wb_WebContent.Document as IHTMLDocument2).Body.outerHTML;
// 总数据条数
Delete(ws, 1, Pos('<TD>共 ', ws) + 6);
FTotalCount:= Copy(ws, 1, Pos(' 条信息', ws) - 1);
if FTotalCount = '0' then
begin
MessageBox(Handle, '没有可供获取的网页数据信息! ', '提示', MB_OK + MB_ICONINFORMATION);
Exit;
end;
// 总分页数
Delete(ws, 1, Pos('页次:', ws) + 7);
FTotalPageCount:= Copy(ws, 1, Pos('页 20篇', ws) - 1);
if FTotalPageCount = '0' then
begin
MessageBox(Handle, '没有可供获取的网页数据信息!', '提示', MB_OK + MB_ICONINFORMATION);
Exit;
end;
Result:= True;
end;
function Tfrm_Main.GotoNthPage(PageIndex: string): Boolean;
var
Doc: IHTMLDocument2;
Form: IHTMLFormElement;
Elements: IHTMLElementCollection;
InputElem: IHTMLInputElement;
I: Integer;
begin
Result:= False;
Doc:= wb_WebContent.Document as IHTMLDocument2;
Elements:= Doc.Forms as IHTMLElementCollection;
Form:= Elements.Item(0, varEmpty) as IHTMLFormElement;
Elements:= (Doc.All as IHTMLElementCollection).tags('input') as IHTMLElementCollection;
for I:= 0 to Elements.Length - 1 do // 找到,并填充页码文本框
begin
InputElem:= Elements.Item(I, varEmpty) as IHTMLInputElement;
if UpperCase(Trim(InputElem.Name)) = 'CPF.CPAGE' then
begin
{
问题来源: http://jdxx.zhs.mofcom.gov.cn/website/btgs.jsp
实现步骤:
1、浏览指定网页,使用者输入查询条件查询;
2、点击 获取网页内容 按钮,进行
2.1、跳到 第N页,保存其内容到 ListView;重复...
2.2、保存 ListView 内容 到 Excel;
}
unit NetFetch;
interface
uses
Windows, Messages, SysUtils, Variants, Classes, Graphics, Controls, Forms,
Dialogs, StdCtrls, Buttons, ExtCtrls, OleCtrls, SHDocVw, XPMan, ComCtrls;
type
Tfrm_Main = class(TForm)
XPManifest1: TXPManifest;
wb_WebContent: TWebBrowser;
Panel1: TPanel;
Panel2: TPanel;
Label1: TLabel;
btn_GotoURL: TSpeedButton;
edt_URL: TEdit;
btn_Close: TButton;
dlg_SaveInfo: TSaveDialog;
Tmr_ClsDlg: TTimer;
btn_GetNetInfo: TButton;
lv_Info: TListView;
lbl_Hints: TLabel;
Tmr_GotoURL: TTimer;
procedure FormShow(Sender: TObject);
procedure FormResize(Sender: TObject);
procedure edt_URLKeyDown(Sender: TObject; var Key: Word;
Shift: TShiftState);
procedure btn_GotoURLClick(Sender: TObject);
procedure btn_GetNetInfoClick(Sender: TObject);
procedure btn_CloseClick(Sender: TObject);
procedure Tmr_GotoURLTimer(Sender: TObject);
procedure Tmr_ClsDlgTimer(Sender: TObject);
private
{ Private declarations }
FTotalCount: string; // 总数据条数
FTotalPageCount: string; // 总页数
public
{ Public declarations }
function GetDatasInfo: Boolean; // 得到数据条数、数据总页数
function GotoNthPage(PageIndex: string): Boolean; // 转至第 PageIndex 页
function SaveInfoToLV: Boolean; // 保存抓取的数据入 ListView
function SaveLVToXLS(FileName: string): Boolean; // 将 ListView 内容存入 Excel
end;
var
frm_Main: Tfrm_Main;
implementation
uses
MSHtml, ComObj, ShellAPI;
{$R *.dfm}
procedure Tfrm_Main.FormShow(Sender: TObject);
begin
Tmr_GotoURL.Enabled:= True;
edt_URL.SetFocus;
end;
procedure Tfrm_Main.FormResize(Sender: TObject);
begin
btn_GetNetInfo.Left:= (Width - 190) div 2;
btn_Close.Left:= btn_GetNetInfo.Left + 130;
end;
procedure Tfrm_Main.edt_URLKeyDown(Sender: TObject; var Key: Word;
Shift: TShiftState);
begin
if Key = VK_RETURN then
begin
Key:= 0;
wb_WebContent.Navigate(edt_URL.Text);
end;
end;
procedure Tfrm_Main.btn_GotoURLClick(Sender: TObject);
begin
wb_WebContent.Navigate(edt_URL.Text);
end;
procedure Tfrm_Main.btn_GetNetInfoClick(Sender: TObject);
var
I: Integer;
J: Integer;
begin
if lv_Info.Items.Count > 0 then
begin
with lv_Info do
begin
Items.BeginUpdate;
Items.Clear;
Items.EndUpdate;
end;
end;
btn_GetNetInfo.Enabled:= False;
btn_Close.Enabled:= False;
try
if not GetDatasInfo then
Exit;
if FTotalPageCount = '1' then // 只 1页,直接保存退出即可
begin
SaveInfoToLV;
end
else
begin // 多于 1页,先保存第 1 页内容,再...
SaveInfoToLV;
for I:= 2 to StrToInt(FTotalPageCount) do
begin
if not GotoNthPage(IntToStr(I)) then
begin
MessageBox(
Handle,
'网页跳转失败,程序无法继续抓取数据! ',
'错误',
MB_OK+ MB_ICONERROR
);
Exit;
end;
lbl_Hints.Caption:= '正在获取第 ' + IntToStr(I) + '/' + FTotalPageCount + ' 页数据信息...';
Update;
Application.ProcessMessages;
while wb_WebContent.Busy do
Application.ProcessMessages;
ShowMessage('转页...'); // 延时,可以优化...
for J:= 0 to 20 do
Application.ProcessMessages;
ShowMessage('转页毕...');
Application.ProcessMessages;
Sleep(200);
Application.ProcessMessages;
while True do
begin
if SaveInfoToLV then
Break
else
begin
if not GotoNthPage(IntToStr(I)) then
begin
MessageBox(
Handle,
PChar('网页跳转失败(第 ' + IntToStr(I) + ' 页),程序无法继续抓取数据! '),
'错误',
MB_OK+ MB_ICONERROR
);
Exit;
end;
end;
end;
Application.ProcessMessages;
end;
end;
finally
btn_GetNetInfo.Enabled:= True;
btn_Close.Enabled:= True;
lbl_Hints.Caption:= '';
Update;
end;
if dlg_SaveInfo.Execute then
begin
if SaveLVToXLS(dlg_SaveInfo.FileName) then
begin
if MessageBox(
Handle,
PChar(
'所获取的网页数据信息已成功保存至 ' + dlg_SaveInfo.FileName +
',是否现在查看?'
),
'提示',
MB_YESNO + MB_ICONQUESTION
) = IDYES then
begin
if ShellExecute(0, 'Open', PChar(dlg_SaveInfo.FileName), nil, nil, SW_SHOW) <= 32 then
MessageBox(Handle, '打开文件失败! ', '提示', MB_OK + MB_ICONINFORMATION);
end;
end;
end;
with lv_Info do
begin
Items.BeginUpdate;
Items.Clear;
Items.EndUpdate;
end;
end;
procedure Tfrm_Main.btn_CloseClick(Sender: TObject);
begin
Close;
end;
procedure Tfrm_Main.Tmr_GotoURLTimer(Sender: TObject);
begin
Tmr_GotoURL.Enabled:= False;
Screen.Cursor:= crHourGlass;
wb_WebContent.Navigate(edt_URL.Text);
while wb_WebContent.Busy do
Application.ProcessMessages;
Screen.Cursor:= crDefault;
end;
procedure Tfrm_Main.Tmr_ClsDlgTimer(Sender: TObject);
var
H: THandle;
begin
H:= FindWindow('TMessageForm', PChar(Application.Title));
if H > 0 then
begin
SendMessage(H, WM_KEYDOWN, VK_SPACE, 0);
SendMessage(H, WM_CLOSE, 0, 0);
end;
end;
function Tfrm_Main.GetDatasInfo: Boolean;
var
ws: string;
begin
Result:= False;
ws:= (wb_WebContent.Document as IHTMLDocument2).Body.outerHTML;
// 总数据条数
Delete(ws, 1, Pos('<TD>共 ', ws) + 6);
FTotalCount:= Copy(ws, 1, Pos(' 条信息', ws) - 1);
if FTotalCount = '0' then
begin
MessageBox(Handle, '没有可供获取的网页数据信息! ', '提示', MB_OK + MB_ICONINFORMATION);
Exit;
end;
// 总分页数
Delete(ws, 1, Pos('页次:', ws) + 7);
FTotalPageCount:= Copy(ws, 1, Pos('页 20篇', ws) - 1);
if FTotalPageCount = '0' then
begin
MessageBox(Handle, '没有可供获取的网页数据信息!', '提示', MB_OK + MB_ICONINFORMATION);
Exit;
end;
Result:= True;
end;
function Tfrm_Main.GotoNthPage(PageIndex: string): Boolean;
var
Doc: IHTMLDocument2;
Form: IHTMLFormElement;
Elements: IHTMLElementCollection;
InputElem: IHTMLInputElement;
I: Integer;
begin
Result:= False;
Doc:= wb_WebContent.Document as IHTMLDocument2;
Elements:= Doc.Forms as IHTMLElementCollection;
Form:= Elements.Item(0, varEmpty) as IHTMLFormElement;
Elements:= (Doc.All as IHTMLElementCollection).tags('input') as IHTMLElementCollection;
for I:= 0 to Elements.Length - 1 do // 找到,并填充页码文本框
begin
InputElem:= Elements.Item(I, varEmpty) as IHTMLInputElement;
if UpperCase(Trim(InputElem.Name)) = 'CPF.CPAGE' then
begin
转载于:https://blog.51cto.com/adelphicoder/214662