Jsoup网页爬虫案例

12 篇文章 0 订阅

       最近我的一个软件要改版,为此做了一个用于演示的 demo,在这里分享给大家,一起学习如何使用 Jsoup 抓取并解析 HTML 页面。


package cn.oschina.net;

import android.app.Activity;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.DialogInterface;
import android.content.Intent;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.view.View;
import android.widget.AdapterView;
import android.widget.ListView;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class MainActivity extends Activity {
    private static final String TAG ="MainActivity" ;
    ListView listView;
    TitleAdapter mAdapter;

    /** Called when the activity is first created. */
    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.login);
        listView= (ListView) findViewById(R.id.lv_main);
        new PageTask(this).execute();
    }


    private class PageTask extends AsyncTask<String, Integer, List<Map<String, Object>>> {
        // 可变长的输入参数,与AsyncTask.exucute()对应
        ProgressDialog pdialog;
        MyCache cache ;
        public PageTask(Context context) {
            cache=(MyCache) context.getApplicationContext();
            pdialog = new ProgressDialog(context, 0);
            pdialog.setTitle("conneting....");
            pdialog.setButton("cancel", new DialogInterface.OnClickListener() {
                public void onClick(DialogInterface dialog, int i) {
                    dialog.cancel();
                }
            });
            pdialog.setOnCancelListener(new DialogInterface.OnCancelListener() {
                public void onCancel(DialogInterface dialog) {
                    finish();
                }
            });
            // pdialog.setCancelable(true);
            pdialog.setMax(100);
            pdialog.setProgressStyle(ProgressDialog.STYLE_HORIZONTAL);
            pdialog.show();

        }

        @Override
        protected List<Map<String, Object>> doInBackground(String... params) {
            List<Map<String, Object>> arr = null;
            try {
                int count=1;
                int length=100;
                if(cache.getCacheDoc().containsKey("doc")){
                    SoftReference<List<Map<String, Object>>> soft= cache.getCacheDoc().get("doc");
                    arr = soft.get();
                    for(int i=1;i<=100;i++){
                        publishProgress((int) ((i / (float) length) * 100));
                    }
                    Log.v("OSchina-","read cache");
                }else{
                    Log.v("OSchina-","not read cache");
                    arr = new ArrayList<Map<String,Object>>();
                    Document doc = Jsoup.connect(
                            "http://fotomen.cn/")
                            .timeout(30000).post();
                    Log.v("OSchina-","request over:"+(doc!=null));
                    Elements titleElement = doc.select("div.cb-article-meta");
                    for(int i=0; i<titleElement.size();i++){
                        String title= titleElement.get(i).select("h2").text();
                        String linkHref = titleElement.get(i).getElementsByTag("a").attr("href");
                        Log.d(TAG,"title====="+title);
                        Log.d(TAG,"linkHref====="+linkHref);
                        HashMap<String,Object> hashMap = new HashMap<String, Object>();
                        hashMap.put("title",title);  //标题
                        hashMap.put("url",linkHref);   //文章url
                        arr.add(hashMap);
                     }
                    cache.getCacheDoc().put("doc", new SoftReference<List<Map<String, Object>>>(arr));
                }
            } catch (Exception e) {
                Log.e("doInBackground", "--"+e);
            }
            return arr;

        }

        @Override
        protected void onCancelled() {
            super.onCancelled();
        }

        @Override
        protected void onPostExecute(final List<Map<String, Object>> result) {
            final List<Map<String, Object>> data = result;
         pdialog.dismiss();
                if(result!=null){
                    mAdapter=new TitleAdapter(MainActivity.this,result);
                    listView.setAdapter(mAdapter);
                    listView.setOnItemClickListener(new AdapterView.OnItemClickListener() {
                        @Override
                        public void onItemClick(AdapterView<?> adapterView, View view, int i, long l) {
                            Intent intent = new Intent(MainActivity.this,OSchinaMain.class);
                            intent.putExtra("url",data.get(i).get("url").toString());
                            startActivity(intent);
                        }
                    });
                }else{
                    Log.d(TAG,"result====null");
                }
        }

        @Override
        protected void onPreExecute() {
            // 任务启动,可以在这里显示一个对话框,这里简单处理
            // message.setText(R.string.task_started);
        }

        @Override
        protected void onProgressUpdate(Integer... values) {
            // 更新进度
            //System.out.println("" + values[0]);
            // message.setText(""+values[0]);
            pdialog.setProgress(values[0]);
        }
    }

}

package cn.oschina.net;
import java.lang.ref.SoftReference;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import android.content.Intent;
import android.widget.LinearLayout;
import android.widget.TextView;
import android.app.Activity;
import android.app.ProgressDialog;
import android.content.Context;
import android.content.DialogInterface;
import android.os.AsyncTask;
import android.os.Bundle;
import android.util.Log;
import android.widget.Toast;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;


/**
 * Article detail screen.
 *
 * <p>Reads the article address from the {@code "url"} intent extra, downloads the page in a
 * background task and appends title, author, publication time and body text to a
 * {@link TextView}.
 */
public class OSchinaMain extends Activity {

    private static final String TAG = "OSchinaMain";
    private static final int TIMEOUT_MS = 8000;
    /**
     * Prefix for per-article cache keys. It keeps article entries apart from the
     * {@code "doc"} key that MainActivity uses for the article list.
     */
    private static final String ARTICLE_KEY_PREFIX = "article:";

    private TextView textView;
    private LinearLayout ll_all;
    Map<String, SoftReference<List<Map<String, Object>>>> cacheDoc;
    /** The running load task, kept so it can be cancelled when the Activity is destroyed. */
    private PageTask pageTask;

    @Override
    public void onCreate(Bundle savedInstanceState) {
        super.onCreate(savedInstanceState);
        setContentView(R.layout.main);
        ll_all = (LinearLayout) findViewById(R.id.ll_all);
        textView = (TextView) findViewById(R.id.tv_readContent);
        String url = getIntent().getStringExtra("url");
        if (url == null || url.length() == 0) {
            // Without an address there is nothing to load.
            Log.e(TAG, "missing \"url\" extra");
            finish();
            return;
        }
        pageTask = new PageTask(this, url);
        pageTask.execute();
    }

    @Override
    protected void onDestroy() {
        // Stop the task and close its dialog so neither outlives the window they belong to.
        if (pageTask != null) {
            pageTask.release();
        }
        super.onDestroy();
    }

    /**
     * Downloads and parses a single article off the UI thread while showing a progress
     * dialog. The result is a one-element list whose map has the keys {@code "title"},
     * {@code "author"}, {@code "contentTime"} and {@code "article"}, or {@code null} when
     * loading failed.
     */
    private class PageTask extends AsyncTask<String, Integer, List<Map<String, Object>>> {
        ProgressDialog pdialog;
        MyCache cache;
        String url;

        /**
         * Creates the task and immediately shows its progress dialog.
         *
         * @param context the Activity context; its application context must be a {@code MyCache}
         * @param url     address of the article page to download
         */
        public PageTask(Context context, String url) {
            cache = (MyCache) context.getApplicationContext();
            this.url = url;
            Log.v("OSchina-", "url==" + url);
            pdialog = new ProgressDialog(context, 0);
            pdialog.setTitle("正在连接请稍候....");
            pdialog.setButton("cancel", new DialogInterface.OnClickListener() {
                public void onClick(DialogInterface dialog, int i) {
                    dialog.cancel();
                }
            });
            pdialog.setOnCancelListener(new DialogInterface.OnCancelListener() {
                public void onCancel(DialogInterface dialog) {
                    // Cancelling the dialog abandons the load and leaves the screen.
                    PageTask.this.cancel(true);
                    finish();
                }
            });
            pdialog.setMax(100);
            pdialog.setProgressStyle(ProgressDialog.STYLE_HORIZONTAL);
            pdialog.show();
        }

        /** Cancels the task and closes the dialog without triggering the cancel listener. */
        void release() {
            cancel(true);
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
        }

        /**
         * Downloads the article page and extracts its fields.
         *
         * <p>No UI call is made here: this method runs on a worker thread, so failures are
         * reported by returning {@code null} and handled in {@link #onPostExecute}.
         *
         * @param params unused
         * @return a one-element list describing the article, or {@code null} on any failure
         *         (network error, or an expected element missing from the page)
         */
        @Override
        protected List<Map<String, Object>> doInBackground(String... params) {
            try {
                Log.v("OSchina-", "不走缓存");
                Document doc = Jsoup.connect(url).timeout(TIMEOUT_MS).get();
                Log.v("OSchina-", "请求结束:" + (doc != null));

                // get(0) throws when the element is absent; the catch below turns that
                // into a failed load.
                String contentTime =
                        doc.select("span.cb-title-fi").select("time.updated").get(0).text();
                String author = doc.select("span.fn").get(0).text();
                String title = doc.select("span.cb-title-fi").select("h1").get(0).text();
                // Combined text of all <article> elements. The former
                // attr("section", "articleBody") call set an attribute rather than
                // filtering, so omitting it leaves the extracted text unchanged.
                String article = doc.select("article").text();

                Log.d("OSchina-", "title:" + title);
                Log.d("OSchina-", "author:" + author);
                Log.d("OSchina-", "contentTime:" + contentTime);
                Log.d("OSchina-", "article:" + article);

                Map<String, Object> fields = new HashMap<String, Object>();
                fields.put("title", title);
                fields.put("author", author);
                fields.put("contentTime", contentTime);
                fields.put("article", article);

                List<Map<String, Object>> parsed = new ArrayList<Map<String, Object>>();
                parsed.add(fields);
                // Store under a per-article key; writing to "doc" would replace the
                // cached article list with an entry that has no "url".
                cache.getCacheDoc().put(ARTICLE_KEY_PREFIX + url,
                        new SoftReference<List<Map<String, Object>>>(parsed));
                return parsed;
            } catch (Exception e) {
                // Pass the throwable itself so the stack trace reaches logcat.
                Log.e(TAG, "failed to load " + url, e);
                return null;
            }
        }

        /**
         * Closes the dialog, then either renders the article or reports the failure and
         * closes the screen.
         *
         * @param result the parsed article, or {@code null} on failure
         */
        @Override
        protected void onPostExecute(List<Map<String, Object>> result) {
            if (pdialog.isShowing()) {
                pdialog.dismiss();
            }
            if (isFinishing()) {
                return;
            }
            if (result == null) {
                Toast.makeText(OSchinaMain.this, "连接超时", Toast.LENGTH_LONG).show();
                finish();
                return;
            }

            for (Map<String, Object> item : result) {
                textView.append((String) item.get("title") + "\n");
                textView.append((String) item.get("author") + "\n");
                textView.append((String) item.get("contentTime") + "\n");
                textView.append((String) item.get("article"));
            }
            if (!result.isEmpty()) {
                // Leave the text view as the container's only child.
                // NOTE(review): assumes tv_readContent is a direct child of ll_all — confirm in the layout.
                ll_all.removeAllViews();
                ll_all.addView(textView);
            }
        }

        @Override
        protected void onPreExecute() {
            // The progress dialog is already shown by the constructor.
        }

        /** Mirrors the published percentage on the progress dialog. */
        @Override
        protected void onProgressUpdate(Integer... values) {
            pdialog.setProgress(values[0]);
        }

    }

}

以 fotomen.cn 为例:首页取出最新的几篇文章展示在列表中,点击列表项即可查看对应文章的具体内容。




      


此demo的下载地址是:http://download.csdn.net/detail/ligl0702/7001333



  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值