校园助手APP--爬取教务处网页，并解析出数据

本文链接：https://blog.csdn.net/brian512/article/details/41079829

在搭建服务器之前，我是直接爬取网页数据的；即便做到最后，通知内容也仍然是直接爬取教务处网页获得的。这种方式在某些场合还是用得上的，在此介绍一下。

/**
 * Static helpers for fetching pages from the academic-affairs web site.
 *
 * <p>A single {@link DefaultHttpClient} backed by a thread-safe connection
 * manager is created lazily and shared by every request.
 */
public class InternetHelper {

	private static final String TAG = "InternetHelper";

	public static final String URL_BASE = "http://61.183.207.40/zjdxgc/(kgd5dczwtsnv50yznsqeuh55)/";
	public static final String USER_AGENT = "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)";
	public static final String HOST = "http://61.183.207.40";

	/** Charset used to decode every response body. */
	private static final String PAGE_CHARSET = "gb2312";

	/** Cookie store attached to the shared client when non-null. */
	private static CookieStore cookie = null;

	/** Lazily created shared client; guarded by the class lock in {@link #getClient()}. */
	private static DefaultHttpClient client = null;

	/**
	 * Returns the shared HttpClient, creating and configuring it on first use.
	 *
	 * <p>The method is synchronized so that concurrent first calls cannot
	 * create more than one client (and more than one connection pool).
	 *
	 * @return the shared, configured HttpClient
	 */
	public static synchronized DefaultHttpClient getClient() {
		if (null == client) {
			client = createClient();
		}

		if (null != cookie) {
			client.setCookieStore(cookie);
			Log.i(TAG, cookie.toString());
		}

		return client;
	}

	/** Builds a new client with the request parameters used by this app. */
	private static DefaultHttpClient createClient() {
		HttpParams httpParams = new BasicHttpParams();
		httpParams.setParameter("http.protocol.allow-circular-redirects", Boolean.TRUE);
		httpParams.setParameter(ClientPNames.COOKIE_POLICY, CookiePolicy.BEST_MATCH);
		HttpClientParams.setRedirecting(httpParams, true);

		// Request encoding and protocol options.
		HttpProtocolParams.setContentCharset(httpParams, HTTP.UTF_8);
		HttpProtocolParams.setUseExpectContinue(httpParams, true);
		HttpProtocolParams.setUserAgent(httpParams, USER_AGENT);

		HttpConnectionParams.setTcpNoDelay(httpParams, true);
		// Skip the stale-connection check to speed up requests.
		HttpConnectionParams.setStaleCheckingEnabled(httpParams, false);
//		// Timeout for obtaining a connection from the pool
//		ConnManagerParams.setTimeout(httpParams, 1000);
//		// Connect timeout
//		HttpConnectionParams.setConnectionTimeout(httpParams, 2000);
//		// Socket (read) timeout
//		HttpConnectionParams.setSoTimeout(httpParams, 4000);

		// Support both HTTP and HTTPS.
		SchemeRegistry schReg = new SchemeRegistry();
		schReg.register(new Scheme("http", PlainSocketFactory.getSocketFactory(), 80));
		schReg.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443));

		// Thread-safe connection manager so the client can be shared.
		ClientConnectionManager conMgr = new ThreadSafeClientConnManager(httpParams, schReg);

		return new DefaultHttpClient(conMgr, httpParams);
	}

	/**
	 * Downloads the HTML source of the page at the given URL.
	 *
	 * <p>A 200 response is decoded and returned. A 302 response is followed
	 * once by hand using its {@code Location} header. A 400 response is logged
	 * together with its body. For every other outcome the response body is
	 * discarded so the pooled connection is released.
	 *
	 * @param url_news URL of the page to fetch; also sent as the Referer
	 * @return the page source, or {@code null} when the status is 400, is not
	 *         one of 200/302, or a 302 response carries no Location header
	 * @throws Exception if the request or reading the response fails
	 */
	public static String getNewsHtmlByURL(String url_news) throws Exception {

		DefaultHttpClient httpClient = InternetHelper.getClient();

		HttpGet request = new HttpGet(url_news);
		request.setHeader("Referer", url_news);
		request.setHeader("User-Agent", USER_AGENT);

		HttpResponse response = httpClient.execute(request);
		int statusCode = response.getStatusLine().getStatusCode();
		Log.i(TAG, "statusCode————————————————>" + statusCode);

		HttpEntity entity = response.getEntity();

		if (statusCode == 200) {
			return readBody(entity);
		}

		if (statusCode == 400) {
			Log.e(TAG, "出错了，400，下面是得到的html代码：" + readBody(entity));
			return null;
		}

		// The body is not needed from here on; drain it so the connection
		// goes back to the pool before any follow-up request is made.
		discard(entity);

		if (statusCode == 302) {	// redirect
			Header[] locationHeaders = response.getHeaders("Location");
			if (locationHeaders.length == 0) {
				Log.w(TAG, "302 response without a Location header: " + url_news);
				return null;
			}

			// Only prefix HOST when the server sent a host-relative location.
			String location = locationHeaders[0].getValue();
			if (!isAbsoluteHttpUrl(location)) {
				location = HOST + location;
			}

			HttpGet redirect = new HttpGet(location);
			redirect.setHeader("Referer", location);
			return readBody(httpClient.execute(redirect).getEntity());
		}

		return null;
	}

	/** Decodes the entity with {@link #PAGE_CHARSET}; returns null when there is no entity. */
	private static String readBody(HttpEntity entity) throws java.io.IOException {
		return (entity == null) ? null : EntityUtils.toString(entity, PAGE_CHARSET);
	}

	/** Drains an unused entity so that its connection is released. */
	private static void discard(HttpEntity entity) throws java.io.IOException {
		if (entity != null) {
			entity.consumeContent();
		}
	}

	/** Tells whether the value already starts with an http:// or https:// scheme. */
	private static boolean isAbsoluteHttpUrl(String value) {
		return value.regionMatches(true, 0, "http://", 0, 7)
				|| value.regionMatches(true, 0, "https://", 0, 8);
	}

}

直接访问网页需要添加一些请求头信息，具体需要哪些请求头要通过抓包来确定。换了几次电脑，已经忘了当时用的是 Chrome 的哪个插件。

这样获取得到的是整个网页的源码，需要解析源码获得需要的信息，首先是教务处通知的列表信息，包含通知的标题以及相应的链接

	/**
	 * Matches one notice link in the list page. Group 1 is the relative href,
	 * group 2 is the anchor text. Compiled once because it never changes.
	 */
	private static final Pattern NEWS_LINK_PATTERN = Pattern.compile(
			"<a href=\"(Content\\.asp\\?c=14&a=[^>]*&todo=show)\"  target=\"_blank\">(.*?)</a>");

	/**
	 * Extracts the list of notices from the HTML source of the list page.
	 *
	 * <p>The anchor text is expected to look like {@code 【source】title}. When
	 * the brackets are missing, the whole anchor text is used as the title and
	 * the source is left empty, so one odd entry does not abort the parse.
	 *
	 * @param html source of the page containing the notice titles; may be null
	 * @return the notices found, in page order; empty when html is null or
	 *         contains no matching link
	 */
	private static List<News> getNewsByHtml(String html) {

		List<News> newsList = new ArrayList<News>();
		if (html == null) {
			return newsList;
		}

		Matcher matcher = NEWS_LINK_PATTERN.matcher(html);
		while (matcher.find()) {
			String href = matcher.group(1);
			String text = matcher.group(2);

			// Cut at the first closing tag, if the anchor text contains markup.
			int closingTag = text.indexOf("</");
			if (closingTag >= 0) {
				text = text.substring(0, closingTag);
			}

			int open = text.indexOf('【');
			int close = text.indexOf('】');

			String source;
			String title;
			if (open >= 0 && close > open) {
				source = text.substring(open + 1, close);
				title = text.substring(close + 1);
			} else {
				source = "";
				title = text;
			}

			News news = new News();	// wrap the fields in a News object
			news.setTitle(title);
			news.setSource(source);
			news.setUrl("http://jwc.wit.edu.cn/" + href);

			newsList.add(news);
		}
		return newsList;
	}

不断地分析源码，通过正则表达式获得标题及链接，封装为一个列表显示出来；点击列表中的某一条标题，就跳转到该通知的详细内容页面。

通过链接可以获取到详细通知的网页源码，由于源码中包含太多的标签信息，为方便找到正文开始的地方，就通过通知的标题来定位：

	/**
	 * Loads the News entity for the title the user clicked.
	 *
	 * <p>The record is read from the local database first. When it has no
	 * content yet, the page at the stored URL is downloaded, the body text and
	 * publish time are extracted from it, and the record is written back
	 * through {@code updateNews}.
	 *
	 * @param title title of the notice to load
	 * @return the News entity, including its content
	 * @throws IllegalStateException if the content must be fetched but no URL
	 *         is stored for the title
	 * @throws java.io.IOException if the page cannot be fetched or does not
	 *         contain the expected content markers
	 * @throws Exception if the download itself fails
	 */
	public News getNewsByTitle(String title) throws Exception {

		SQLiteDatabase database = dbHelper.getWritableDatabase();
		try {
			News news = new News();

			Cursor cursor = database.query(News.TABLE_NAME, new String[]{News.ID,News.URL,News.SOURCE,News.TIME,News.CONTENT}, News.TITLE+"=?", new String[]{title}, null, null, null);
			try {
				if (cursor.moveToFirst()) {
					news.setId(cursor.getLong(cursor.getColumnIndex(News.ID)));
					news.setTime(cursor.getString(cursor.getColumnIndex(News.TIME)));
					news.setSource(cursor.getString(cursor.getColumnIndex(News.SOURCE)));
					news.setContent(cursor.getString(cursor.getColumnIndex(News.CONTENT)));
					news.setUrl(cursor.getString(cursor.getColumnIndex(News.URL)));
					news.setTitle(title);
				}
			} finally {
				// Release the cursor before the (slow) network request.
				cursor.close();
			}

			// Fetch the body from the network when the database has none.
			if (null == news.getContent()) {
				String url = news.getUrl();
				if (url == null) {
					throw new IllegalStateException("No URL stored for news title: " + title);
				}

				String html = InternetHelper.getNewsHtmlByURL(url);
				if (html == null) {
					throw new java.io.IOException("Could not fetch news page: " + url);
				}

				// Drop everything up to and including the title, when present.
				int titleAt = html.indexOf(title);
				if (titleAt >= 0) {
					html = html.substring(titleAt + title.length());
				}
				String regexstr = "<(?!p|/p).*?>";
				html = html.replaceAll(regexstr, "");	// strip HTML tags
				// Turn non-breaking spaces (entity or character) into plain spaces.
				html = html.replace("&nbsp;", " ").replace('\u00A0', ' ');

				// Body text: after the first "教务处", up to the last "机构设置".
				String content = extractBetween(html, "教务处", "机构设置", true);
				if (content == null) {
					throw new java.io.IOException("Unrecognized news page layout: " + url);
				}
				news.setContent(content.trim());

				// Publish time: between "发布时间：" and "点击次数"; kept as-is when absent.
				String time = extractBetween(html, "发布时间：", "点击次数", false);
				if (time != null) {
					news.setTime(time);
				}

				// Persist the newly obtained content and time.
				updateNews(news);
			}
			return news;
		} finally {
			// Close the database on failure paths as well.
			database.close();
		}
	}

	/**
	 * Returns the text between two markers, or null when either marker is
	 * missing or the end marker does not come after the start marker.
	 *
	 * @param text          text to search
	 * @param startMarker   marker preceding the wanted text (excluded)
	 * @param endMarker     marker following the wanted text (excluded)
	 * @param lastEndMarker true to use the last occurrence of endMarker,
	 *                      false to use the first one after startMarker
	 */
	private static String extractBetween(String text, String startMarker, String endMarker, boolean lastEndMarker) {
		int markerAt = text.indexOf(startMarker);
		if (markerAt < 0) {
			return null;
		}
		int from = markerAt + startMarker.length();
		int to = lastEndMarker ? text.lastIndexOf(endMarker) : text.indexOf(endMarker, from);
		if (to < from) {
			return null;
		}
		return text.substring(from, to);
	}

去掉网页源码的头尾后，就能够清晰地看到内容部分及一些网页标签，再次通过正则表达式解析出正文内容。

因为获取到通知信息后会存入数据库，所以初始化时会先从数据库读取信息，只是还没来得及做列表刷新。