步骤一:先用 HttpClient 发起请求,获取目标页面的 cookie 以及其他参数
步骤二:利用获取到的参数构造 Request 请求
步骤三:WebMagic 利用获取到的 cookie 以及构造好的 Request 发送 POST 请求
// Crawls the video watch page and collects the cookie and request parameters
// that the follow-up "show more comments" request needs.
class VideoSpider {
    public String cookie;
    public String showMoreURL;
    public String session_token;
    public String client_url;

    /**
     * Downloads {@code url} through the given proxy and fills in the cookie,
     * the "show more" URL and the session token.
     *
     * @param url      watch-page URL; also stored as {@code client_url}
     * @param proxyStr proxy description; it is split on ':' so it is expected to
     *                 look like {@code scheme://host:port}
     * @throws IllegalStateException if the page cannot be downloaded
     */
    public VideoSpider(String url, String proxyStr) {
        this.client_url = url;
        // "scheme://host:port" -> ["scheme", "//host", "port"]
        String[] tmp = proxyStr.split(":");
        HttpHost proxy = new HttpHost(tmp[1].substring(2), Integer.parseInt(tmp[2]), tmp[0]);
        Site site = Site.me().setRetryTimes(3).setHttpProxy(proxy).setSleepTime(100).setTimeOut(10 * 1000).setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");
        GPHttpClientDownloader downloader = new GPHttpClientDownloader();
        Request request = new Request(this.client_url);
        this.setCookie(request, site, downloader);
        this.setParameters(request, site, downloader);
    }

    /**
     * Requests the page once and builds the {@code cookie} string from the
     * name=value part of every {@code Set-Cookie} response header, prefixed
     * with {@code "hl=en; "}.
     *
     * @throws IllegalStateException if the downloader returns no response
     */
    public void setCookie(Request request, Site site, GPHttpClientDownloader downloader) {
        CloseableHttpResponse httpResponse = downloader.downloadForResponse(request, site.toTask());
        if (httpResponse == null) {
            // downloadForResponse returns null on a rejected status code or on an exception
            throw new IllegalStateException("Failed to fetch " + request.getUrl() + " while collecting cookies");
        }
        StringBuilder cookieBuilder = new StringBuilder("hl=en; ");
        for (Header header : httpResponse.getHeaders("Set-Cookie")) {
            String value = header.getValue();
            // Keep only the leading "name=value" pair; attributes such as Path follow the first ';'.
            int firstSeparator = value.indexOf(';');
            cookieBuilder.append(firstSeparator < 0 ? value : value.substring(0, firstSeparator)).append(';');
        }
        this.cookie = cookieBuilder.toString();
    }

    /**
     * Downloads the page and extracts the comments token and XSRF token from its
     * HTML to populate {@code showMoreURL} and {@code session_token}.
     *
     * @throws IllegalStateException if the downloader returns no page
     */
    public void setParameters(Request request, Site site, GPHttpClientDownloader downloader) {
        Html contentHtml = requireFetched(downloader.download(request, site.toTask()), request).getHtml();
        // substring(32) drops the leading "https://www.youtube.com/watch?v=" (32 characters);
        // assumes client_url always starts with exactly that prefix - TODO confirm for other URL shapes.
        // NOTE(review): if a regex does not match, the token presumably ends up as null/"null" - verify.
        this.showMoreURL = "https://www.youtube.com/watch_fragments_ajax?v=" + this.client_url.substring(32)
                + "&tr=time&distiller=1&ctoken=" + contentHtml.regex("'COMMENTS_TOKEN': \"(.*?)\"").toString()
                + "&frags=comments&spf=load";
        this.session_token = contentHtml.regex("'XSRF_TOKEN': \"(.*?)\"").toString();
    }

    // Returns the download result unchanged, or fails with a clear message when the download yielded null.
    private static <T> T requireFetched(T result, Request request) {
        if (result == null) {
            throw new IllegalStateException("Failed to download " + request.getUrl());
        }
        return result;
    }

    public String getCookie() {
        return cookie;
    }

    public String getShowMoreURL() {
        return showMoreURL;
    }

    public String getSession_token() {
        return session_token;
    }

    public String getClient_url() {
        return client_url;
    }
}
// Processes the "show more" response: prints the text extracted from the JSON "body" field.
class ShowMoreSpider implements PageProcessor {
    private Site site;

    /**
     * Builds the site configuration used for the "show more" request.
     *
     * @param proxyStr proxy description; it is split on ':' so it is expected to
     *                 look like {@code scheme://host:port}
     * @param cookie   value sent as the {@code Cookie} request header
     */
    public ShowMoreSpider(String proxyStr, String cookie) {
        // "scheme://host:port" -> ["scheme", "//host", "port"]
        String[] proxyParts = proxyStr.split(":");
        HttpHost proxyHost = new HttpHost(
                proxyParts[1].substring(2),
                Integer.parseInt(proxyParts[2]),
                proxyParts[0]);
        Site configured = Site.me()
                .setRetryTimes(3)
                .setHttpProxy(proxyHost)
                .addHeader("Cookie", cookie)
                .setSleepTime(100)
                .setTimeOut(10 * 1000)
                .setCharset("UTF-8")
                .setUserAgent("Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36");
        this.site = configured;
    }

    /**
     * Reads the "body" field of the JSON response, applies the bullet regex to it
     * and prints the result to standard output.
     */
    @Override
    public void process(Page page) {
        String extracted = page.getJson().jsonPath("body").regex("u2022 (.*?)<").toString();
        System.out.println(extracted);
    }

    /** Returns the site configuration built in the constructor. */
    @Override
    public Site getSite() {
        return this.site;
    }
}
/**
 * Entry point: fetches the watch page to obtain cookie and tokens, then POSTs
 * the "show more" request through a WebMagic spider.
 */
public class VideoCommentSpider {
    public static String proxyString = "http://XXX.XX.XXX.XXX:XXXX";

    public static void main(String[] args) {
        // Collect the cookie, show-more URL, client_url and session_token from the video page.
        String url = "https://www.youtube.com/watch?v=Xo94zT93fAY";
        VideoSpider vs = new VideoSpider(url, proxyString);

        // Fetch the comments.
        PageProcessor spider = new ShowMoreSpider(proxyString, vs.getCookie());
        NameValuePair[] values = new NameValuePair[] {
            new BasicNameValuePair("client_url", vs.getClient_url()),
            new BasicNameValuePair("session_token", vs.getSession_token())
        };
        Request request = new Request(vs.getShowMoreURL());
        // The form fields travel as the "nameValuePair" extra; putExtra avoids the raw Map/HashMap.
        request.putExtra("nameValuePair", values);
        request.setMethod(HttpConstant.Method.POST);
        Spider.create(spider).thread(5).addRequest(request).run();
    }
}
@ThreadSafe
public class GPHttpClientDownloader extends AbstractDownloader {
private Logger logger = LoggerFactory.getLogger(getClass());
private final Map<String, CloseableHttpClient> httpClients = new HashMap<String, CloseableHttpClient>();
private HttpClientGenerator httpClientGenerator = new HttpClientGenerator();
/**
 * Returns the HTTP client for the site's domain, creating and caching it on
 * first use. A null site gets a client from the generator without caching.
 */
private CloseableHttpClient getHttpClient(Site site) {
    if (site == null) {
        return httpClientGenerator.getClient(null);
    }
    final String cacheKey = site.getDomain();
    // NOTE(review): this first lookup happens outside the lock on a plain HashMap.
    CloseableHttpClient cached = httpClients.get(cacheKey);
    if (cached != null) {
        return cached;
    }
    synchronized (this) {
        // Re-check under the lock so only one client is created per domain.
        cached = httpClients.get(cacheKey);
        if (cached == null) {
            cached = httpClientGenerator.getClient(site);
            httpClients.put(cacheKey, cached);
        }
        return cached;
    }
}
/**
 * Executes {@code request} and converts an accepted response into a {@code Page}.
 *
 * @param request the request to execute; its STATUS_CODE extra is updated
 * @param task    supplies the site configuration; may be null
 * @return the page, the result of {@code addToCycleRetry} when an I/O error occurs
 *         and cycle retry is enabled, or null when the status code is not accepted
 *         or the download failed
 */
@Override
public Page download(Request request, Task task) {
    Site site = (task != null) ? task.getSite() : null;
    Set<Integer> acceptStatCode;
    String charset = null;
    Map<String, String> headers = null;
    if (site != null) {
        acceptStatCode = site.getAcceptStatCode();
        charset = site.getCharset();
        headers = site.getHeaders();
    } else {
        // Without a site only plain 200 responses are accepted.
        acceptStatCode = Sets.newHashSet(200);
    }
    logger.info("downloading page {}", request.getUrl());
    CloseableHttpResponse httpResponse = null;
    int statusCode = 0;
    try {
        HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
        httpResponse = getHttpClient(site).execute(httpUriRequest);
        statusCode = httpResponse.getStatusLine().getStatusCode();
        request.putExtra(Request.STATUS_CODE, statusCode);
        if (statusAccept(acceptStatCode, statusCode)) {
            Page page = handleResponse(request, charset, httpResponse, task);
            onSuccess(request);
            return page;
        } else {
            logger.warn("code error {}\t{}", statusCode, request.getUrl());
            return null;
        }
    } catch (IOException e) {
        logger.warn("download page " + request.getUrl() + " error", e);
        // site may legitimately be null here (task == null), so guard before reading retry settings.
        if (site != null && site.getCycleRetryTimes() > 0) {
            return addToCycleRetry(request, site);
        }
        onError(request);
        return null;
    } finally {
        // Recorded again so the extra is set even when execution failed before the status was read.
        request.putExtra(Request.STATUS_CODE, statusCode);
        try {
            if (httpResponse != null) {
                // ensure the connection is released back to pool
                EntityUtils.consume(httpResponse.getEntity());
            }
        } catch (IOException e) {
            logger.warn("close response fail", e);
        }
    }
}
/**
 * Executes {@code request} like {@code download} but hands back the raw HTTP
 * response instead of a page, so callers can inspect the response headers.
 * The entity is consumed in the {@code finally} block before the caller
 * receives the response.
 *
 * @param request the request to execute; its STATUS_CODE extra is updated
 * @param task    supplies the site configuration; may be null
 * @return the response when its status code is accepted, otherwise null
 *         (also null when any exception occurs)
 */
public CloseableHttpResponse downloadForResponse(Request request, Task task) {
    Site site = (task != null) ? task.getSite() : null;
    Set<Integer> acceptStatCode;
    String charset = null;
    Map<String, String> headers = null;
    if (site != null) {
        acceptStatCode = site.getAcceptStatCode();
        charset = site.getCharset();
        headers = site.getHeaders();
    } else {
        // Without a site only plain 200 responses are accepted.
        acceptStatCode = Sets.newHashSet(200);
    }
    logger.info("downloading page {}", request.getUrl());
    CloseableHttpResponse httpResponse = null;
    int statusCode = 0;
    try {
        HttpUriRequest httpUriRequest = getHttpUriRequest(request, site, headers);
        httpResponse = getHttpClient(site).execute(httpUriRequest);
        statusCode = httpResponse.getStatusLine().getStatusCode();
        request.putExtra(Request.STATUS_CODE, statusCode);
        if (statusAccept(acceptStatCode, statusCode)) {
            // The page itself is not needed here; the call is kept because it reads the
            // body and records it in testContent.
            handleResponse(request, charset, httpResponse, task);
            onSuccess(request);
            return httpResponse;
        } else {
            logger.warn("code error {}\t{}", statusCode, request.getUrl());
            return null;
        }
    } catch (IOException e) {
        logger.warn("download page " + request.getUrl() + " error", e);
        onError(request);
        return null;
    } catch (Exception e) {
        // Report through the logger instead of printing the stack trace to stderr.
        logger.warn("download page " + request.getUrl() + " error", e);
        return null;
    } finally {
        request.putExtra(Request.STATUS_CODE, statusCode);
        try {
            if (httpResponse != null) {
                // ensure the connection is released back to pool
                EntityUtils.consume(httpResponse.getEntity());
            }
        } catch (IOException e) {
            logger.warn("close response fail", e);
        }
    }
}
/**
 * Passes the thread count on to the HTTP client generator as its pool size.
 *
 * @param thread number of threads
 */
@Override
public void setThread(int thread) {
    this.httpClientGenerator.setPoolSize(thread);
}
/**
 * Tells whether a response status code is one of the accepted codes.
 *
 * @param acceptStatCode set of accepted status codes
 * @param statusCode     status code of the response
 * @return true when {@code statusCode} is contained in {@code acceptStatCode}
 */
protected boolean statusAccept(Set<Integer> acceptStatCode, int statusCode) {
    final Integer boxedCode = Integer.valueOf(statusCode);
    return acceptStatCode.contains(boxedCode);
}
/**
 * Builds the HttpClient request for {@code request}: method and URI, the given
 * headers, and a request config carrying the site's timeouts and proxy.
 *
 * @param request the crawler request; its PROXY extra is set when a proxy is used
 * @param site    site configuration; may be null, in which case no timeouts or
 *                proxy are configured
 * @param headers extra request headers; may be null
 * @return the request ready to be executed
 */
protected HttpUriRequest getHttpUriRequest(Request request, Site site, Map<String, String> headers) {
    RequestBuilder requestBuilder = selectRequestMethod(request).setUri(request.getUrl());
    if (headers != null) {
        for (Map.Entry<String, String> headerEntry : headers.entrySet()) {
            requestBuilder.addHeader(headerEntry.getKey(), headerEntry.getValue());
        }
    }
    RequestConfig.Builder requestConfigBuilder = RequestConfig.custom()
            .setCookieSpec(CookieSpecs.BEST_MATCH);
    // Callers allow a null site (task == null), so only read site settings when one is present.
    if (site != null) {
        requestConfigBuilder
                .setConnectionRequestTimeout(site.getTimeOut())
                .setSocketTimeout(site.getTimeOut())
                .setConnectTimeout(site.getTimeOut());
        HttpHost host = null;
        if (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable()) {
            // An enabled proxy pool takes precedence over the single configured proxy.
            host = site.getHttpProxyFromPool();
        } else if (site.getHttpProxy() != null) {
            host = site.getHttpProxy();
        }
        if (host != null || (site.getHttpProxyPool() != null && site.getHttpProxyPool().isEnable())) {
            requestConfigBuilder.setProxy(host);
            request.putExtra(Request.PROXY, host);
        }
    }
    requestBuilder.setConfig(requestConfigBuilder.build());
    return requestBuilder.build();
}
/**
 * Picks the HttpClient request builder matching the request's HTTP method
 * (case-insensitive). A missing method means GET. For POST, the
 * {@code NameValuePair[]} stored under the "nameValuePair" extra, when present
 * and non-empty, is added as parameters.
 *
 * @param request the crawler request
 * @return a builder for the selected method
 * @throws IllegalArgumentException if the method is not one of
 *         GET, POST, HEAD, PUT, DELETE or TRACE
 */
protected RequestBuilder selectRequestMethod(Request request) {
    final String verb = request.getMethod();
    if (verb == null || verb.equalsIgnoreCase(HttpConstant.Method.GET)) {
        // default get
        return RequestBuilder.get();
    }
    if (verb.equalsIgnoreCase(HttpConstant.Method.POST)) {
        final RequestBuilder post = RequestBuilder.post();
        final NameValuePair[] formFields = (NameValuePair[]) request.getExtra("nameValuePair");
        if (formFields == null || formFields.length == 0) {
            return post;
        }
        post.addParameters(formFields);
        return post;
    }
    if (verb.equalsIgnoreCase(HttpConstant.Method.HEAD)) {
        return RequestBuilder.head();
    }
    if (verb.equalsIgnoreCase(HttpConstant.Method.PUT)) {
        return RequestBuilder.put();
    }
    if (verb.equalsIgnoreCase(HttpConstant.Method.DELETE)) {
        return RequestBuilder.delete();
    }
    if (verb.equalsIgnoreCase(HttpConstant.Method.TRACE)) {
        return RequestBuilder.trace();
    }
    throw new IllegalArgumentException("Illegal HTTP Method " + verb);
}
// Holds the body text of the most recently handled response (written by handleResponse).
public static String testContent;

/**
 * Reads the response body and wraps it, together with the request URL, the
 * request itself and the response status code, in a new {@code Page}.
 * The body text is also stored in {@code testContent}.
 *
 * @param request      the request that produced the response
 * @param charset      charset used to decode the body; may be null
 * @param httpResponse the response to read
 * @param task         not used by this implementation
 * @return the populated page
 * @throws IOException if reading the body fails
 */
protected Page handleResponse(Request request, String charset, HttpResponse httpResponse, Task task) throws IOException {
    final String body = getContent(charset, httpResponse);
    testContent = body;

    final Page result = new Page();
    result.setRawText(body);
    final PlainText pageUrl = new PlainText(request.getUrl());
    result.setUrl(pageUrl);
    result.setRequest(request);
    final int status = httpResponse.getStatusLine().getStatusCode();
    result.setStatusCode(status);
    return result;
}
/**
 * Public pass-through to {@code getContent}, exposing the body decoding for tests.
 *
 * @param charset      charset used to decode the body; may be null
 * @param httpResponse the response to read
 * @return the decoded body text
 * @throws IOException if reading the body fails
 */
public String getConentForTest(String charset, HttpResponse httpResponse) throws IOException {
    final String decoded = getContent(charset, httpResponse);
    return decoded;
}
/**
 * Reads the response body as text.
 *
 * @param charset      charset to decode with; when null the charset is detected
 *                     via {@code getHtmlCharset}, falling back to the platform default
 * @param httpResponse the response to read
 * @return the decoded body, or an empty string when the response carries no entity
 * @throws IOException if reading or decoding the body fails
 */
protected String getContent(String charset, HttpResponse httpResponse) throws IOException {
    if (httpResponse.getEntity() == null) {
        // A response may have no entity at all (presumably e.g. replies to HEAD requests); nothing to decode.
        return "";
    }
    if (charset != null) {
        return IOUtils.toString(httpResponse.getEntity().getContent(), charset);
    }
    byte[] contentBytes = IOUtils.toByteArray(httpResponse.getEntity().getContent());
    String htmlCharset = getHtmlCharset(httpResponse, contentBytes);
    if (htmlCharset != null) {
        return new String(contentBytes, htmlCharset);
    }
    logger.warn("Charset autodetect failed, use {} as charset. Please specify charset in Site.setCharset()", Charset.defaultCharset());
    // Deliberate fallback to the platform default charset, as announced in the warning above.
    return new String(contentBytes);
}
/**
 * Tries to determine the charset of a response body.
 * Order of precedence: (1) the charset in the HTTP Content-Type header,
 * (2) a charset declared in an HTML {@code <meta>} tag of the body.
 *
 * @param httpResponse the response whose entity headers are inspected
 * @param contentBytes the raw body bytes
 * @return the detected charset name, or null when none was found
 * @throws IOException declared for decoding failures
 */
protected String getHtmlCharset(HttpResponse httpResponse, byte[] contentBytes) throws IOException {
    String charset = null;
    // 1. encoding in http header Content-Type (the header may be absent)
    if (httpResponse.getEntity().getContentType() != null) {
        String value = httpResponse.getEntity().getContentType().getValue();
        charset = UrlUtils.getCharset(value);
        if (StringUtils.isNotBlank(charset)) {
            logger.debug("Auto get charset: {}", charset);
            return charset;
        }
    }
    // use default charset to decode first time
    Charset defaultCharset = Charset.defaultCharset();
    String content = new String(contentBytes, defaultCharset.name());
    // 2. charset in meta
    if (StringUtils.isNotEmpty(content)) {
        Document document = Jsoup.parse(content);
        Elements links = document.select("meta");
        for (Element link : links) {
            // 2.1 html4.01 <meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
            String metaContent = link.attr("content");
            String metaCharset = link.attr("charset");
            int charsetIndex = metaContent.indexOf("charset");
            if (charsetIndex != -1) {
                String[] keyValue = metaContent.substring(charsetIndex).split("=");
                // Only accept "charset=<value>"; a bare mention of "charset" has no value to take.
                if (keyValue.length > 1 && StringUtils.isNotBlank(keyValue[1])) {
                    charset = keyValue[1].trim();
                    break;
                }
            }
            // 2.2 html5 <meta charset="UTF-8" />
            if (StringUtils.isNotEmpty(metaCharset)) {
                charset = metaCharset;
                break;
            }
        }
    }
    logger.debug("Auto get charset: {}", charset);
    // 3. todo use tools as cpdetector for content decode
    return charset;
}
}