使用 HtmlUnit 获取百度新闻搜索结果的 URL

/**
 * Command-line scraper that submits a keyword to the Baidu News search form
 * with HtmlUnit, walks through the result pages and prints every anchor
 * target that looks like an external result link.
 *
 * <p>Links are filtered out when they are empty, start with {@code "/"} or
 * {@code "j"} (site-relative and {@code javascript:} links), are longer than
 * {@value #MAX_URL_LENGTH} characters, or contain the text {@code "baidu"}.
 */
public class baiduNewsPost {
	/** Paging stops once the page counter reaches this value, so pages 1..37 are visited. */
	private static final int PAGE_LIMIT = 38;

	/** Anchors whose href is longer than this are skipped. */
	private static final int MAX_URL_LENGTH = 100;

	/** Lower bound of the pause between two result pages, in milliseconds. */
	private static final int MIN_SLEEP_MS = 30000;

	/** Width of the random part added to {@link #MIN_SLEEP_MS}, in milliseconds. */
	private static final int SLEEP_JITTER_MS = 45000;

	/** XPath selecting the previous/next pagination links. */
	private static final String NAV_LINK_XPATH = "//a[@class='n']";

	/**
	 * Runs the search and prints the collected URLs to standard output.
	 *
	 * @param args unused
	 * @throws FailingHttpStatusCodeException if a request returns a failing HTTP status
	 * @throws MalformedURLException if the start URL is rejected
	 * @throws IOException if a page cannot be loaded
	 * @throws InterruptedException if the thread is interrupted during the pause between pages
	 */
	public static void main(String args[])
			throws FailingHttpStatusCodeException, MalformedURLException,
			IOException, InterruptedException {
		final WebClient webclient = new WebClient();
		try {
			// Options must be set before the first request, otherwise the
			// landing page is still loaded with CSS and JavaScript enabled.
			webclient.getOptions().setCssEnabled(false);
			webclient.getOptions().setJavaScriptEnabled(false);

			final HtmlPage htmlpage = webclient
					.getPage("http://news.baidu.com/");
			System.out.println(htmlpage.asText());
			System.out.println(htmlpage.getTitleText());

			final HtmlForm form = htmlpage.getFormByName("fbaidu");
			final HtmlSubmitInput button = form.getInputByValue("百度一下");
			final HtmlTextInput textField = form.getInputByName("word");
			String keyword = "***";
			textField.setValueAttribute(keyword);
			HtmlPage page = button.click();
			System.out.println(page);

			// De-duplicated view of everything printed below.
			HashSet<String> ts = new HashSet<String>();
			int pagenum = 1;
			while (page != null && pagenum < PAGE_LIMIT) {
				java.util.List<HtmlAnchor> achList = page.getAnchors();
				for (HtmlAnchor ach : achList) {
					String url = ach.getHrefAttribute();
					if (!isExternalResultUrl(url)) {
						continue;
					}
					System.out.println(url);
					ts.add(url);
				}

				HtmlElement nextpage = findNextPageLink(page, pagenum);
				if (nextpage == null) {
					// No further result page: stop paging.
					break;
				}
				// Follow the "next page" link.
				page = nextpage.click();
				System.out.println("pagenum :" + pagenum + nextpage.asText());
				pagenum++;

				// Random pause so the crawler is less likely to be blocked by
				// Baidu; pointless once the page limit has been reached.
				if (pagenum < PAGE_LIMIT) {
					int sleeptime = (int) (Math.random() * SLEEP_JITTER_MS) + MIN_SLEEP_MS;
					Thread.sleep(sleeptime);
				}
			}
		} finally {
			// Release browser windows even when a request or lookup fails.
			webclient.closeAllWindows();
		}
	}

	/**
	 * Decides whether an anchor target should be reported as a search result.
	 *
	 * @param url the raw href attribute; may be empty when the anchor has none
	 * @return {@code true} if the URL passes all filters
	 */
	private static boolean isExternalResultUrl(String url) {
		if (url == null || url.isEmpty()) {
			// Anchors without an href would otherwise break the prefix check.
			return false;
		}
		if (url.startsWith("/") || url.startsWith("j")) {
			return false;
		}
		if (url.length() > MAX_URL_LENGTH) {
			return false;
		}
		// Same result as searching with the regex ".*?baidu.*?".
		return !url.contains("baidu");
	}

	/**
	 * Locates the "next page" link of a result page.
	 *
	 * <p>On the first page the first navigation link is taken; on later pages
	 * the second one is taken, and only when exactly two navigation links are
	 * present. NOTE(review): this presumably relies on later pages showing
	 * "previous" then "next" - confirm against the live markup. The XPath is
	 * absolute, so it is evaluated on the page itself; a missing pagination
	 * bar simply yields no link.
	 *
	 * @param page the current result page
	 * @param pagenum 1-based number of the current result page
	 * @return the link to click, or {@code null} if there is none
	 */
	private static HtmlElement findNextPageLink(HtmlPage page, int pagenum) {
		java.util.List<?> navLinks = page.getByXPath(NAV_LINK_XPATH);
		if (pagenum == 1) {
			return navLinks.isEmpty() ? null : (HtmlElement) navLinks.get(0);
		}
		if (navLinks.size() == 2) {
			return (HtmlElement) navLinks.get(1);
		}
		return null;
	}
}

发布了152 篇原创文章 · 获赞 0 · 访问量 8万+
展开阅读全文

Java HtmlUnit 抓取不到页面元素

02-23

下面代码中 能获取到百度页面的Form 表单, 但是获取不到另外一个 网页的表单, 此时该如何处理? ps:获取另外一个网页的源码保存html后,用浏览器打开,能看到 form 表单。 package com.xttx.cn.fetchpro.fetchImp; import java.io.IOException; import java.net.URL; import java.util.List; import com.gargoylesoftware.htmlunit.BrowserVersion; import com.gargoylesoftware.htmlunit.CollectingAlertHandler; import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException; import com.gargoylesoftware.htmlunit.HttpWebConnection; import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController; import com.gargoylesoftware.htmlunit.WebClient; import com.gargoylesoftware.htmlunit.WebRequest; import com.gargoylesoftware.htmlunit.html.HtmlForm; import com.gargoylesoftware.htmlunit.html.HtmlPage; import com.xttx.cn.fetchpro.exception.PageNotFoundException; public class SimulationFetch { protected static WebClient webClient = null; public WebClient getWebClient() { return webClient; } public void setWebClient(WebClient webClient) { this.webClient = webClient; } static WebRequest request = null; public static void main(String[] args) throws FailingHttpStatusCodeException, IOException, PageNotFoundException, InterruptedException { // TODO Auto-generated method stub BrowserVersion.FIREFOX_24.setBrowserLanguage("zh-CN"); BrowserVersion.FIREFOX_24.setSystemLanguage("zh-CN"); BrowserVersion.FIREFOX_24.setUserLanguage("zh-CN"); BrowserVersion.FIREFOX_24.setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64; rv:46.0) Gecko/20100101 Firefox/46.0"); BrowserVersion.FIREFOX_24.setBrowserVersion(46.0f); BrowserVersion.FIREFOX_24.setCpuClass("x64"); webClient = new WebClient(BrowserVersion.FIREFOX_24); webClient.setWebConnection(new HttpWebConnection(webClient)); webClient.getCache().clear(); webClient.getCookieManager().clearCookies(); webClient.getOptions().setJavaScriptEnabled(true); webClient.setJavaScriptTimeout(60*1000); webClient.getOptions().setCssEnabled(true); webClient.getOptions().setActiveXNative(true); webClient.getOptions().setPopupBlockerEnabled(true); 
webClient.getOptions().setRedirectEnabled(true); webClient.getOptions().setTimeout(10000); webClient.getOptions().setDoNotTrackEnabled(true); webClient.getCookieManager().setCookiesEnabled(true); webClient.getOptions().setThrowExceptionOnFailingStatusCode(true); webClient.getOptions().setThrowExceptionOnScriptError(false); webClient.getOptions().setUseInsecureSSL(true); webClient.getOptions().setSSLInsecureProtocol("TLSv1.2"); webClient.setAjaxController(new NicelyResynchronizingAjaxController()); webClient.setAlertHandler(new CollectingAlertHandler()); HtmlPage loginPageWithForm =(HtmlPage)webClient.getPage(new WebRequest(new URL("https://login.youzan.com/sso/index?service=kdt&from_source=pzshouye"))); List<HtmlForm> htmlForm0 = loginPageWithForm.getForms(); HtmlPage loginPageNoForm = (HtmlPage)webClient.getPage(new WebRequest(new URL("https://www.baidu.com/"))); HtmlForm htmlForm = loginPageNoForm.getForms().get(0); System.out.println("---"); } } 问答

没有更多推荐了,返回首页

©️2019 CSDN 皮肤主题: 大白 设计师: CSDN官方博客

分享到微信朋友圈

×

扫一扫,手机浏览