- 以美国adidas官网为例。
- 输入url,抓取商品信息(标题、描述、图片等);抓取属性信息(颜色、尺码、价格、库存、skuId)。
- 思路很简单,就是打开页面,找出并分析所需内容对应的各个标签。
获取页面
/**
 * Issues an HTTP POST through {@code getHttpPostResponse} and parses the response body
 * into a jsoup {@link Document}.
 *
 * <p>The body is read with {@code EntityUtils.toString}, passing {@code "UTF-8"} as the
 * charset argument. The entity is always consumed before this method returns, whether
 * parsing succeeds or fails.
 *
 * @param url        target URL of the POST request
 * @param referrer   referrer value forwarded to {@code getHttpPostResponse}
 * @param params     form parameters forwarded to {@code getHttpPostResponse}
 * @param httpClient client used to execute the request
 * @return the parsed response body
 * @throws IOException if the request fails, the body cannot be read, or the response
 *                     carries no entity at all
 */
public static Document getHttpPostResponseWithDocument(String url, String referrer, List<NameValuePair> params, DecompressingHttpClient httpClient) throws IOException {
    HttpResponse response = getHttpPostResponse(url, referrer, params, httpClient);
    // A response without a body cannot be parsed; report it as an I/O failure
    // instead of letting EntityUtils reject the null entity with an unchecked exception.
    if (response.getEntity() == null) {
        throw new IOException("No response entity returned for POST " + url);
    }
    try {
        return Jsoup.parse(EntityUtils.toString(response.getEntity(), "UTF-8"));
    } finally {
        // Consume in finally so the cleanup also runs when reading or parsing throws;
        // the quiet variant avoids masking the original exception.
        EntityUtils.consumeQuietly(response.getEntity());
    }
}
/**
 * Executes an HTTP GET for the given URL and returns the response exactly as produced
 * by the client; the entity is not read here.
 *
 * @param url        URL to fetch
 * @param referrer   value for the {@code Referer} header; skipped when blank
 * @param httpClient client used to execute the request
 * @return the response returned by {@code httpClient.execute}
 * @throws IOException if executing the request fails
 */
public static HttpResponse getHttpGetResponse(String url, String referrer, DecompressingHttpClient httpClient) throws IOException {
    final HttpGet request = new HttpGet(url);
    setHeaders(request);
    // The Referer header is set after setHeaders has run, and only for a non-blank referrer.
    final boolean hasReferrer = !StringUtils.isBlank(referrer);
    if (hasReferrer) {
        request.setHeader("Referer", referrer);
    }
    return httpClient.execute(request);
}
判断是否有货
/**
 * Reports whether the current page ({@code doc}) shows an add-to-cart button.
 *
 * @return {@code true} only when at least one {@code .addtocart} element exists and the
 *         rendered markup of those elements contains the text {@code add-to-cart-button}
 */
public boolean isInStock() {
    Elements cartSections = doc.select(".addtocart");
    boolean hasCartSection = cartSections != null && !cartSections.isEmpty();
    // The button is detected by a plain substring search over the rendered markup.
    return hasCartSection && cartSections.toString().contains("add-to-cart-button");
}
颜色获取
public ExecInfo parse(String url, Map<String, String> colorMap) {
ExecResult<Document> execResult = getOneSkuInfoPage(url)
if (!execResult.isSucc()) {
LogUtils.info(execResult.getMsg())
}
if(!isInStock()) {
LogUtils.info("out of stock!")
return ExecInfo.fail("out of stock!")
}
Elements curColorElements = doc.select(".product-color")
if(null == curColorElements || curColorElements.isEmpty()) {
return ExecInfo.fail("获取当前商品颜色信息失败")
} else {
Pattern COLOR_PATTERN = Pattern.compile("<span class=\"product-color-clear\">([^<]*)</span>")
Pattern SKU_PATTERN = Pattern.compile("\\(([0-9A-Za-z]*)\\)")
Matcher color_matcher = COLOR_PATTERN.matcher(curColorElements.toString())
Matcher sku_matcher = SKU_PATTERN.matcher(curColorElements.toString())
if(color_matcher.find() && sku_matcher.find()) {
LogUtils.info("CURRENT COLOR: " + sku_matcher.group(1) + ", " + color_matcher.group(1))
}
}
//Elements elements = doc.select("#colorVariationsCarousel")
Elements elements = doc.select(".color-variation-row")
if(null != elements && !elements.isEmpty()) {
for (Element element : elements) {
Elements colorElements = element.select(".color-variations-thumb-color")
for (Element colorElement : colorElements) {
//LogUtils.info(colorElement.toString())
Pattern SKU_PATTERN = Pattern.compile("data-articleno=\"([0-9A-Za-z]*)")
Pattern TITLE_PATTERN = Pattern.compile("title=\"([^\"]*)")
Matcher sku_matcher = SKU_PATTERN.matcher(colorElement.toString())
Matcher title_matcher = TITLE_PATTERN.matcher(colorElement.toString())
if (sku_matcher.find() && title_matcher.find()) {
colorMap.put(sku_matcher.group(1), title_matcher.group(1))
}
}
}
}
LogUtils.info(colorMap.toString())
return ExecInfo.succ()
}