// download by http://www.codefans.net/soft/5667.shtml #if !defined AFX_TESTDLG_H__ #define AFX_TESTDLG_H__ #pragma once #pragma warning (disable : 4786) #include <string> #include <map> #include <list> #include <queue> using namespace std; struct Bus_t { bool bSaved; // 是否获取过了 string busName; // 线路名称 string info; // 简介 string url; // 对应的 url 后缀 list<string> stationsGo; // 站台 去程 list<string> stationsBack; // 站台 回程 }; /*========================================================================*/ class CTestDlg : public CDialog { public: CFont m_font; CImageList m_imagelist; CTestDlg(CWnd* pParent = NULL); //{{AFX_DATA(CTestDlg) enum { IDD = IDD_TEST_DIALOG }; CProgressCtrl m_pp; CListCtrl m_list; //}}AFX_DATA //{{AFX_VIRTUAL(CTestDlg) protected: virtual void DoDataExchange(CDataExchange* pDX); // DDX/DDV support //}}AFX_VIRTUAL protected: void GetBusInfo(); void SaveToFile(); void SaveToFile(FILE * fp, Bus_t & t); char * GetFileName(); map<string, Bus_t> m_mapBus; //{{AFX_MSG(CTestDlg) virtual BOOL OnInitDialog(); afx_msg void OnButton1(); afx_msg void OnKillfocusEdit1(); //}}AFX_MSG DECLARE_MESSAGE_MAP() }; /*========================================================================*/ CString getHTML(CString strURL); void StripTags(LPTSTR pszBuffer); void SplitStations(char * pstation, list<string>& lstStation); void WriteFile(const char * pszContent, const char * pszFilename); char * StrToInt(const char * str, int & n); string FormatBusLineToOrder(const char sLine[]); char * GetFileName(const char * url); #endif ------------------ #include "stdafx.h" #include "Test.h" #include "TestDlg.h" #include <afxinet.h> #ifdef _DEBUG #define new DEBUG_NEW #undef THIS_FILE static char THIS_FILE[] = __FILE__; #endif /*========================================================================*/ // 深圳公交 "http://shenzhen.8684.cn/" // "981路" "x_7ca8d133" 测试 网页内容没有抓全的问题 // "209路区间快车" "x_32245c22" 测试 抓取回程宕机的问题,这是单程线路 // "1路" "x_24f5dad9" 抓取数据的起始线路,其实随便一个线路都可以 // "220路" "x_d397d994" 测试 遇到带"站)"的站台后,回程线路获取不全的问题 // "高峰专线41" "x_28a86909" 单向行驶 未抓取到数据 // 上海公交 "http://shanghai.8684.cn/" // "01路" "x_ccc80acf" // 北京公交 "http://beijing.8684.cn" // "1路" "x_24f5dad9" // 济南 "http://jinan.8684.cn/" // "1路" "x_24f5dad9" #define STR_HTTP_URL "http://jinan.8684.cn/" #define STR_BUS "1路" #define STR_URL "x_24f5dad9" #define STR_GO "去程" #define STR_BACK "回程" #define STR_SINGLE "单向行驶" #define STR_LINE "相关线路" #define STR_HREF_END "</a>" CTestDlg::CTestDlg(CWnd* pParent /*=NULL*/) : CDialog(CTestDlg::IDD, pParent) { //{{AFX_DATA_INIT(CTestDlg) //}}AFX_DATA_INIT } void CTestDlg::DoDataExchange(CDataExchange* pDX) { CDialog::DoDataExchange(pDX); //{{AFX_DATA_MAP(CTestDlg) DDX_Control(pDX, IDC_PROGRESS1, m_pp); DDX_Control(pDX, IDC_LIST1, m_list); //}}AFX_DATA_MAP } BEGIN_MESSAGE_MAP(CTestDlg, CDialog) //{{AFX_MSG_MAP(CTestDlg) ON_BN_CLICKED(IDC_BUTTON1, OnButton1) ON_EN_KILLFOCUS(IDC_EDIT1, OnKillfocusEdit1) //}}AFX_MSG_MAP END_MESSAGE_MAP() /*========================================================================*/ #define MAX 10 BOOL CTestDlg::OnInitDialog() { CDialog::OnInitDialog(); m_imagelist.Create(16,16,TRUE,2,2); m_imagelist.Add(AfxGetApp()->LoadIcon(IDI_ICON1)); m_list.SetImageList(&m_imagelist,LVSIL_SMALL); m_font.CreateFont(16, 0,0,0,FW_NORMAL, 0,0,0, DEFAULT_CHARSET, OUT_CHARACTER_PRECIS, CLIP_CHARACTER_PRECIS, DEFAULT_QUALITY, DEFAULT_PITCH | FF_DONTCARE, "Arial"); m_list.SetFont(&m_font); /*-----------------------------------------------------------*/ m_list.SetExtendedStyle(LVS_EX_FULLROWSELECT | LVS_EX_GRIDLINES); m_list.SetBkColor(RGB(247,247,255)); m_list.SetTextColor(RGB(0,0,255)); m_list.SetTextBkColor(RGB(247,247,255)); m_list.InsertColumn(0, "序号", LVCFMT_LEFT, 50); m_list.InsertColumn(1, "公交路线", LVCFMT_LEFT, 350); m_list.InsertColumn(2, "下载状态", LVCFMT_LEFT, 150); m_pp.SetRange(1,MAX+1); m_pp.SetPos(0); m_pp.SetStep(1); return TRUE; } void CTestDlg::OnButton1() { CString strURL; strURL="http://www.baidu.com/img/baidu_logo.gif"; int nIndex=m_list.InsertItem(0xffff,"0",0); m_list.SetItemText(nIndex,1,strURL); if(::URLDownloadToFile(NULL,strURL,"baidu_logo.gif",0,NULL) == S_OK) { m_list.SetItemText(0,2,"文件下载完成!"); } else { m_list.SetItemText(0,2,"文件下载失败..."); } GetBusInfo(); SaveToFile(); MessageBox("下载完成!", "公交信息", MB_ICONASTERISK | MB_OK); } void CTestDlg::OnKillfocusEdit1() { } CString getHTML(CString strURL) { CInternetSession mySession(NULL,0); CHttpFile* myHttpFile=NULL; CString strHtml=""; CString myData; myHttpFile=(CHttpFile*)mySession.OpenURL(strURL); while(myHttpFile->ReadString(myData)) { strHtml += myData + "/n"; } strHtml += myData; // 有时候明明读取到内容了,但 ReadString 返回了 FASLE /* // 通过 Read 方法读取文本 const int size = 1024; byte pByte[size]; int count = 0; vector<byte> vecByte; CString strUpdateInfo; while( (count = myHttpFile->Read(pByte, size)) > 0 ) { for(int i = 0; i < count; ++i) { vecByte.push_back(pByte[i]); } if( count < size ) break; } if( vecByte.size() > 0 ) { byte * pB = new byte[vecByte.size()]; copy(vecByte.begin(),vecByte.end(),pB); TCHAR * pChr = (TCHAR*)pB; strHtml = pChr; delete [] pB; } */ myHttpFile->Close(); mySession.Close(); WriteFile(strHtml, "3.txt"); return strHtml; } // 写文件 void WriteFile(const char * pszContent, const char * pszFilename) { FILE * fp; if( (fp = fopen(pszFilename, "w+t")) != NULL) { fwrite(pszContent, sizeof(char), strlen(pszContent), fp); fclose(fp); } } void CTestDlg::GetBusInfo() { CString url = STR_HTTP_URL; CString tmp; /*static*/ char szTmp[1024*16]; // 使用静态变态,不占用函数栈中的内存 queue<string> queBus; queBus.push(STR_BUS); //#define SAVE_FILE #ifdef SAVE_FILE FILE * fp; if( (fp = fopen(GetFileName(), "w+t")) == NULL) return; #endif Bus_t t; t.bSaved = false; t.url = STR_URL; t.busName = STR_BUS; m_mapBus.insert(pair<string, Bus_t>(FormatBusLineToOrder(STR_BUS), t)); int nL; int i = 1; int j = 1; // 如果不下载 baidu_logo.gif, 这里改为 0 CString strI; strI.Format("%d",i++); int nIndex=m_list.InsertItem(0xffff,strI,0); m_list.SetItemText(nIndex,1,STR_BUS); UpdateWindow(); CString strBusLine; // 线路名称,为取回程信息而用 while(!queBus.empty()) { string b = queBus.front(); queBus.pop(); map<string, Bus_t>::iterator ibus = m_mapBus.find(FormatBusLineToOrder(b.c_str())); if (ibus != m_mapBus.end()) { Bus_t & bsecond = ibus->second; CString html = getHTML(url + bsecond.url.c_str()); // 解析html int pos = html.Find(bsecond.url.c_str()); if (pos != -1) { html.Delete(0, pos); while(html.GetAt(0) != '>') html.Delete(0); if (html.GetAt(0) == '>') html.Delete(0); // 线路名称 int p1 = html.Find(STR_HREF_END); if (p1 != -1) { strBusLine = html.Left(p1); } } // 取 线路简介 信息 pos = html.Find(STR_GO); if (pos != -1) { tmp = html.Left(pos); nL = tmp.GetLength(); strcpy(szTmp, tmp); // 需要注意不要拷贝越界 StripTags(szTmp); bsecond.info = szTmp; // 简介 html.Delete(0, pos); while(html.GetAt(0) != '>') html.Delete(0); if (html.GetAt(0) == '>') html.Delete(0); } else { // 对单向行驶线路的处理 pos = html.Find(STR_SINGLE); if(pos != -1) { tmp = html.Left(pos); nL = tmp.GetLength(); strcpy(szTmp, tmp); // 需要注意不要拷贝越界 StripTags(szTmp); bsecond.info = szTmp; // 简介 html.Delete(0, pos); while(html.GetAt(0) != '>') html.Delete(0); if (html.GetAt(0) == '>') html.Delete(0); } } // 取 去程 信息 pos = html.Find(STR_BACK); if (pos != -1) { tmp = html.Left(pos); nL = tmp.GetLength(); strcpy(szTmp, tmp); StripTags(szTmp); // 去程 SplitStations(szTmp, bsecond.stationsGo); html.Delete(0, pos); while(html.GetAt(0) != '>') html.Delete(0); if (html.GetAt(0) == '>') html.Delete(0); } // 取 回程 信息 pos = html.Find(strBusLine); if (pos != -1) { tmp = html.Left(pos+strlen(strBusLine)); nL = tmp.GetLength(); strcpy(szTmp, tmp); StripTags(szTmp); // 回程 SplitStations(szTmp, bsecond.stationsBack); html.Delete(0, pos); while(html.GetAt(0) != '>') html.Delete(0); if (html.GetAt(0) == '>') html.Delete(0); } bsecond.bSaved = true; // 该线路已经取到了 #ifdef SAVE_FILE SaveToFile(fp, bsecond); #endif m_list.SetItemText(j++,2,"下载完成!"); m_pp.SetPos(j-1); UpdateWindow(); // 获取其他线路的链接 pos = html.Find(STR_LINE); if (pos != -1) { // 相关线路 html.Delete(0, pos); pos = html.Find("</div>"); if (pos != -1) { html.Delete(pos, html.GetLength()-pos); } map<string, Bus_t>::iterator ibusT; pos = html.Find(STR_HREF_END); while(pos != -1) { tmp = html.Left(pos); html.Delete(0, pos+strlen(STR_HREF_END)); int p1 = tmp.Find("/""); tmp.Delete(0, p1+1); p1 = tmp.Find("/""); CString href = tmp.Left(p1); strcpy(szTmp, tmp); szTmp[0] = '<'; StripTags(szTmp); ibusT = m_mapBus.find(FormatBusLineToOrder(szTmp)); // 之前没有该信息时,才记录,防重复 if (ibusT == m_mapBus.end()) { t.url = href; t.busName = szTmp; m_mapBus.insert(pair<string, Bus_t>(FormatBusLineToOrder(szTmp), t)); queBus.push(szTmp); strI.Format("%d",i++); int nIndex=m_list.InsertItem(0xffff,strI,0); m_list.SetItemText(nIndex,1,szTmp); m_pp.SetRange(1,i); m_pp.SetPos(j-1); UpdateWindow(); } pos = html.Find(STR_HREF_END); } } } } #ifdef SAVE_FILE fclose(fp); #endif } void CTestDlg::SaveToFile() { FILE * fp; if( (fp = fopen(GetFileName(), "w+t")) != NULL) { map<string, Bus_t>::iterator ibus = m_mapBus.begin(); for (; ibus != m_mapBus.end(); ++ibus) { Bus_t & t = ibus->second; SaveToFile(fp, t); } fclose(fp); } } // 2011-2-9 begin void CTestDlg::SaveToFile(FILE * fp, Bus_t & t) { char szCont[128]; // 需要谨慎,防止数组越界 // 线路 fwrite(t.busName.c_str(), sizeof(char), t.busName.length(), fp); fwrite("/n", 1, 1, fp); // 简介 fwrite(" ", 1, 1, fp); fwrite(t.info.c_str(), sizeof(char), t.info.length(), fp); list<string>::iterator iSt; // 判断是单向还是 双向 线路 if (!t.stationsGo.empty()) { sprintf(szCont, "/n 去程/n "); fwrite(szCont, 1, strlen(szCont), fp); for (iSt=t.stationsGo.begin(); iSt!=t.stationsGo.end(); ++iSt) { fwrite(iSt->c_str(), 1, iSt->length(), fp); fwrite(" ", 1, 1, fp); } sprintf(szCont, "%d站/n 回程/n ", t.stationsGo.size()); fwrite(szCont, 1, strlen(szCont), fp); } else { sprintf(szCont, "/n 单向行驶/n "); fwrite(szCont, 1, strlen(szCont), fp); } for (iSt=t.stationsBack.begin(); iSt!=t.stationsBack.end(); ++iSt) { fwrite(iSt->c_str(), 1, iSt->length(), fp); fwrite(" ", 1, 1, fp); } sprintf(szCont, "%d站/n", t.stationsBack.size()); fwrite(szCont, 1, strlen(szCont), fp); } char * StrToInt(const char * str, int & n) { n = 0; char * s = (char *)str; while (*s && (*s<'0'|| *s>'9')) ++s; while (*s && *s>='0' && *s<='9') { n = n*10 + *s - '0'; ++s; } return s; } string FormatBusLineToOrder(const char sLine[]) { char sNum[10]; char sFormat[128]; const char *p = sLine; char *q = sFormat; while(*p!='/0') { while(*p!='/0' && (*p<'0' || *p>'9') ) *q++ = *p++; int n; while(*p!='/0' && *p>='0' && *p<='9') { p = StrToInt(p, n); *q = '/0'; q += sprintf(sNum, "%03d", n); strcat(sFormat, sNum); } } *q = '/0'; return sFormat; } char * CTestDlg::GetFileName() { return ::GetFileName(STR_HTTP_URL); } // url 是以'http://' 开始的网址 char * GetFileName(const char * url) { const int SIZE = 128; static char szFile[SIZE+1]; if (strlen(url) < SIZE) { strcpy(szFile, url); } else { strncpy(szFile, url, SIZE); szFile[SIZE] = '/0'; } char *p = szFile; char *q = szFile+7; // 7 is the length of 'http://' while(*p!='/0' && *p!='.') ++p; if (*p == '.') { ++p; *p++ = 't'; *p++ = 'x'; *p++ = 't'; *p = '/0'; } return q; } // 2011-2-9 end // StripTags() rips through a buffer and removes HTML tags from it. // The function uses a static variable to remember its state in case // a HTML tag spans a buffer boundary. void StripTags(LPTSTR pszBuffer) { static BOOL bInTag = FALSE; LPTSTR pszSource = pszBuffer; LPTSTR pszDest = pszBuffer; while (*pszSource != '/0') { if (bInTag) { if (*pszSource == '>') bInTag = FALSE; pszSource++; } else { if (*pszSource == '<') bInTag = TRUE; else { *pszDest = *pszSource; pszDest++; } pszSource++; } } *pszDest = '/0'; } void SplitStations(char * pstation, list<string>& lstStation) { const int LENTH = 256; char st[LENTH]; // 站台名称不应该太长 int i; while(*pstation != '/0') { i = 0; while(*pstation && *pstation == ' ') pstation++; // 过滤前导空格 while(*pstation && *pstation != '-' && i<LENTH) st[i++] = *pstation++; // 异常处理 if (i==LENTH) break; while (i>1 && st[i-1]==' ') i--; // 过滤末尾空格 st[i] = '/0'; if (*pstation == '-') pstation++; lstStation.push_back(st); } // 删除 最后一个站台中的 "(xx站)" 信息 if (!lstStation.empty()) { CString strTotal; strTotal.Format("(%d站)", lstStation.size()); list<string>::iterator iSt = --lstStation.end(); size_t pos = iSt->find(strTotal); if(pos != string::npos) { iSt->resize(pos); while(iSt->at(iSt->length()-1)==' ') iSt->erase(iSt->length()-1); } } }