本文主要介绍如何爬取电影天堂的电影链接,详细代码如下:
package p80s;
import java.io.IOException;
import java.net.MalformedURLException;
import java.sql.SQLException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import util.JDBCUtil;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.NicelyResynchronizingAjaxController;
import com.gargoylesoftware.htmlunit.Page;
import com.gargoylesoftware.htmlunit.TextPage;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomElement;
import com.gargoylesoftware.htmlunit.html.HtmlAnchor;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlElement;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
public class _movice {
// Class-wide JDBC helper, constructed with the identifier "gypt".
// NOTE(review): "gypt" is presumably a database or configuration name — confirm against util.JDBCUtil.
private static JDBCUtil con = new JDBCUtil("gypt");
// Integer counter that starts at 0.
// NOTE(review): presumably counts the movies handled so far — its usage is outside this view; confirm.
private static int movicenum = 0;
// Mutable, class-wide list of Movice objects; starts out empty.
private static List<Movice> movices = new ArrayList<Movice>();
// NOTE(review): presumably the number of buffered movies that triggers a save (batch size) — confirm where it is read.
private static int savenum = 50;
/**
* Program entry point.
*
* @param args command-line arguments
*/
public static void main(String[] args){
// String uri = "http://www.dytt8.net/html/gndy/china/index.html";
// String uri = "http://www.dytt8.net/html/gndy/china/list_4_95.html";
String uri = "http://www.ygdy8.net/html/gndy/oumei/index.html";
try {
WebClient webclient = new WebClient(BrowserVersion.CHROME);
webclient.getOptions().setJavaScriptEnabled(true); // 启动JS
webclient.getOptions().setUseInsecureSSL(true);//忽略ssl认证
webclient.getOptions().setCssEnabled(false);//禁用Css,可避免自动二次请求CSS进行渲染
webclient.getOptions().setThrowExceptionOnScriptError(false);//运行错误时,不抛出异常
webclient.setAjaxController(new NicelyResynchronizingAjaxController());// 设置Ajax异步
webclient.getOptions().setThrowExceptionOnFailingStatusCode(false);
HtmlPage pag