spider-java (Jsoup) (媒体信息的爬取)

媒体基础信息爬取实例

GetAppname.java(Hive UDF 代码,用于从静态页面中获取应用名称)
package com.hb.hive.utils;

import java.util.Random;

import org.apache.hadoop.hive.ql.exec.UDF;
import org.apache.hadoop.io.Text;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

/**
 * Hive UDF that looks up an app's display name from its id by scraping public
 * app-market web pages with Jsoup.
 *
 * <p>All lookups are best-effort: any failure while fetching or parsing a page
 * is reported on stderr and treated as "name not found" (empty string).
 */
public class GetAppname extends UDF {
	/** Shared generator for the random pause taken before each lookup. */
	private static final Random RANDOM = new Random();

	/** The pause is a random step count in {@code [0, DELAY_STEPS)} ... */
	private static final int DELAY_STEPS = 10;

	/** ... multiplied by this step length in milliseconds. */
	private static final int DELAY_STEP_MILLIS = 3000;

	/**
	 * UDF entry point: resolves the app name for the id given as the first
	 * argument. Any further arguments are ignored.
	 *
	 * @param args UDF arguments; {@code args[0]} is the app id
	 * @return the app name (empty when no market page yielded one), or
	 *         {@code null} when no argument was supplied or the id is NULL
	 */
	public Text evaluate(Object... args) {
		// A NULL column value or a missing argument yields NULL instead of an
		// exception that would fail the whole query.
		if (args == null || args.length == 0 || args[0] == null) {
			return null;
		}
		return new Text(getAppName(args[0].toString()));
	}

	/**
	 * Looks the app id up on several markets in turn and returns the first
	 * non-empty name: Yingyongbao, then Xiaomi, then Wandoujia, and finally
	 * Aso100 (only when the id consists solely of digits).
	 *
	 * <p>Before querying, the calling thread sleeps for a random multiple of
	 * {@value #DELAY_STEP_MILLIS} ms (0 to {@code DELAY_STEPS - 1} steps).
	 *
	 * @param app_id the app id (package name or numeric id)
	 * @return the app name, or an empty string when the id is null/empty, the
	 *         thread is interrupted while pausing, or no market yields a name
	 */
	public static String getAppName(String app_id) {
		if (app_id == null || app_id.isEmpty()) {
			return "";
		}
		try {
			Thread.sleep((long) RANDOM.nextInt(DELAY_STEPS) * DELAY_STEP_MILLIS);
		} catch (InterruptedException e) {
			// Restore the flag so the caller can observe the interruption, and
			// give up on this lookup instead of starting network requests.
			Thread.currentThread().interrupt();
			return "";
		}

		String app_name = getAppNameByYingyongbao(app_id);
		if (app_name.isEmpty()) {
			app_name = getAppNameByXiaomi(app_id);
		}
		if (app_name.isEmpty()) {
			app_name = getAppByWandoujia(app_id);
		}
		// Aso100 is only consulted for purely numeric ids.
		if (app_name.isEmpty() && app_id.matches("[0-9]+")) {
			app_name = getAppNameByAso100(app_id);
		}
		return app_name;
	}

	/**
	 * Looks the app up on Yingyongbao (android.myapp.com).
	 *
	 * @param app_id the app id, passed as the {@code apkName} query parameter
	 * @return combined text of the elements with class {@code det-name-int},
	 *         or an empty string when there is none or the request fails
	 */
	public static String getAppNameByYingyongbao(String app_id) {
		Elements matches = selectByClass("http://android.myapp.com/myapp/detail.htm?apkName=" + app_id,
				"det-name-int");
		return matches == null ? "" : matches.text();
	}

	/**
	 * Looks the app up on the Xiaomi app store (app.mi.com).
	 *
	 * @param app_id the app id, passed as the {@code id} query parameter
	 * @return the {@code alt} attribute found on the elements with class
	 *         {@code yellow-flower}, or an empty string when there is none or
	 *         the request fails
	 */
	public static String getAppNameByXiaomi(String app_id) {
		Elements matches = selectByClass("http://app.mi.com/details?id=" + app_id, "yellow-flower");
		return matches == null ? "" : matches.attr("alt");
	}

	/**
	 * Looks the app up on Aso100 (aso100.com).
	 *
	 * @param app_id the app id, appended to the URL path
	 * @return combined text of the elements with class {@code name-str}, or an
	 *         empty string when there is none or the request fails
	 */
	public static String getAppNameByAso100(String app_id) {
		Elements matches = selectByClass("https://aso100.com/app/rank/appid/" + app_id, "name-str");
		return matches == null ? "" : matches.text();
	}

	/**
	 * Looks the app up on Wandoujia (www.wandoujia.com).
	 *
	 * @param app_id the app id, appended to the URL path
	 * @return combined text of the elements with class {@code app-name}, or an
	 *         empty string when there is none or the request fails
	 */
	public static String getAppByWandoujia(String app_id) {
		Elements matches = selectByClass("http://www.wandoujia.com/apps/" + app_id, "app-name");
		return matches == null ? "" : matches.text();
	}

	/**
	 * Looks the app up on iTunes (itunes.apple.com, "cn" store).
	 *
	 * @param app_id the app id, appended after {@code id} in the URL path
	 * @return the {@code alt} attribute found on the elements with class
	 *         {@code artwork}, or an empty string when there is none or the
	 *         request fails
	 */
	public static String getAppByItunes(String app_id) {
		Elements matches = selectByClass("https://itunes.apple.com/cn/app/id" + app_id, "artwork");
		return matches == null ? "" : matches.attr("alt");
	}

	/**
	 * Looks the app up on Google Play (play.google.com). The original author
	 * notes that reaching this site requires a proxy/VPN from their network.
	 *
	 * @param app_id the app id, passed as the {@code id} query parameter
	 * @return combined text of the elements with class {@code id-app-title},
	 *         or an empty string when there is none or the request fails
	 */
	public static String getAppByGoogleplay(String app_id) {
		Elements matches = selectByClass("https://play.google.com/store/apps/details?id=" + app_id + "&hl=zh_CN",
				"id-app-title");
		return matches == null ? "" : matches.text();
	}

	/**
	 * Fetches a page with an HTTP GET and selects its elements by CSS class.
	 *
	 * @param url      the page to fetch
	 * @param cssClass the class name to select
	 * @return the matching elements, or {@code null} when fetching or parsing
	 *         failed (the failure is reported on stderr)
	 */
	private static Elements selectByClass(String url, String cssClass) {
		try {
			Document doc = Jsoup.connect(url).get();
			return doc.getElementsByClass(cssClass);
		} catch (Exception e) {
			// Best-effort lookup: a failing market must not fail the caller.
			System.err.println("GetAppname: failed to fetch " + url + ": " + e);
			return null;
		}
	}

	/**
	 * Manual smoke test: resolves one sample package name and prints the result.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		System.out.println(getAppName("com.yr.mmpic"));
	}
}


结合phantomjs实现Jsoup动态页面的获取
phantomjs-2.1.1-macosx+java
具体的搭建方式请百度
在安装目录(/usr/local/share/phantomjs-2.1.1-macosx/)下添加 code.js,用于获取动态页面
code.js
// PhantomJS script: loads the URL given as the first command-line argument,
// waits for the page's scripts to run, saves a screenshot, and prints the
// rendered HTML to stdout.
var system = require('system');
var page = require('webpage').create();
var url = system.args[1];

if (!url) {
    // Without a URL there is nothing to load; exit instead of hanging.
    console.log('Usage: phantomjs code.js <url>');
    phantom.exit(1);
} else {
    page.open(url, function (status) {
        if (status !== 'success') {
            console.log('Unable to load ' + url);
            // PhantomJS keeps running until phantom.exit() is called, so the
            // failure path must terminate explicitly (non-zero = error).
            phantom.exit(1);
        } else {
            // Give dynamically loaded content 5 seconds to render before
            // capturing the page.
            window.setTimeout(function () {
                page.render("test1.png");  // screenshot
                console.log(page.content);
                phantom.exit();
            }, 5000);
        }
    });
}
执行命令获取动态页面:phantomjs ./phantomjs-2.1.1-macosx/code.js https://www.qimai.cn/andapp/baseinfo/appid/120023
具体结合方式参考
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值