目录
亚马逊中国站获取全部商品分类
亚马逊中国站获取商品列表
亚马逊中国站通过ASIN获取商品信息
亚马逊中国站获取商品库存信息
亚马逊国际站获取全部商品分类
亚马逊国际站获取商品列表
亚马逊国际站处理图形验证码
亚马逊国际站通过ASIN获取商品信息
亚马逊国际站获取商品库存信息
所提供的代码均可直接运行,但亚马逊的响应数据随时可能变化,从而造成解析异常。如果使用期间遇到问题,欢迎随时沟通,可扫描下方二维码,在公众号留言。
出现验证码情形
一般来说,如果相同ip请求过于频繁,就会出现图形验证码,如果使用不同的user-agent来请求,也可以降低出现验证码的频率,但是不能完全避免。
经测试可行的处理方式有两种。
方式一:使用网络代理,比如使用有效期为10分钟的代理,每10分钟更换一次,基本上就能解决验证码的问题。如果仍然出现验证码,就改为每5分钟更换一次。该方式较为简单,不再提供代码。
方式二:识别出图形验证码并携带cookie重新请求。需要注意的是,验证码识别通过后进入的页面不一定是出现验证码之前原本要访问的页面,所以最好携带cookie重新请求一次原地址。
图形验证码
市面上有多种图形验证码识别软件,之前使用过一款直接启动exe就能识别的,识别亚马逊的验证码一点问题也没有。不过由于它只能在Windows系统上运行,所以暂时不考虑了。
本篇使用的是图鉴(http://ttshitu.com/),识别效果不做保证,主要是便宜。
测试说明
测试请求商品列表信息的前面100页,一般来说,一个从来没有请求过的ip连续调用五六十次也会出现验证码。
Java代码
代码中验证码识别网站的账号密码已替换为占位符,使用前需自行替换为自己的账号密码。
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Map;
import javax.imageio.stream.FileImageOutputStream;
import org.apache.http.HttpResponse;
import org.apache.http.client.CookieStore;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import com.alibaba.fastjson.JSONObject;
/**
 * Demo crawler for amazon.com search-result pages that copes with the image-CAPTCHA page.
 *
 * <p>Flow: fetch a listing page with a cookie-keeping HTTP client; if the CAPTCHA page is returned,
 * download the CAPTCHA image, have it recognised by the ttshitu.com API, submit the answer, then
 * re-request the original URL with the same client (and therefore the same cookies). Each listed
 * product is printed to stdout and the URL of the next page is followed.
 */
public class AmazonTest7 {

    /** Host that both the listing pages and the relative links inside them belong to. */
    private static final String AMAZON_HOST = "https://www.amazon.com";

    /** Text whose presence in a response marks it as the CAPTCHA page. */
    private static final String CAPTCHA_MARKER = "Type the characters you see in this image:";

    private static final String ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9,en;q=0.8";

    private static final String USER_AGENT =
            "Mozilla/5.0(Macintosh;IntelMacOSX10_13_4)AppleWebKit/537.36(KHTML,likeGecko)Chrome/81.0.4044.138Safari/537.36";

    /**
     * Class attribute of one product tile on the listing page.
     * NOTE(review): this is passed to getElementsByClass as one space-separated string, so it
     * presumably only matches tiles whose class attribute is exactly this text - confirm against
     * the jsoup version in use and current Amazon markup.
     */
    private static final String RESULT_ITEM_CLASS =
            "sg-col-4-of-12 s-result-item s-asin sg-col-4-of-16 sg-col s-widget-spacing-small sg-col-4-of-20";

    /** Local file the downloaded CAPTCHA image is stored in before it is uploaded for recognition. */
    private static final String CAPTCHA_IMAGE_FILE = "amazonCode.jpg";

    /** Where the last fetched listing page is dumped for debugging. */
    private static final String DEBUG_DUMP_FILE = "/Users/admin/Desktop/ac.html";

    /** Number of listing pages the demo walks through at most. */
    private static final int MAX_PAGES = 100;

    /** Upper bound on CAPTCHA rounds per page; every round costs one paid recognition call. */
    private static final int MAX_CAPTCHA_ATTEMPTS = 10;

    /** Running number of the next product to print, shared across all pages. */
    static int i = 1;

    /**
     * Walks up to {@link #MAX_PAGES} listing pages, starting from a fixed category URL, and stops
     * early when a page has no "next" link.
     *
     * @param args unused
     * @throws Exception on network or parsing failures, or when a CAPTCHA cannot be solved
     */
    public static void main(String[] args) throws Exception {
        CookieStore store = new BasicCookieStore();
        // One client for the whole run so the cookies earned by solving a CAPTCHA are reused.
        try (CloseableHttpClient httpclient = HttpClients.custom().setDefaultCookieStore(store).build()) {
            String url = "https://www.amazon.com/-/zh/s?bbn=16225009011&rh=i%3Aspecialty-aps%2Cn%3A%2116225009011%2Cn%3A281407&ref_=nav_em__nav_desktop_sa_intl_accessories_and_supplies_0_2_5_2";
            for (int page = 1; page <= MAX_PAGES && url != null; page++) {
                System.out.println(url);
                url = printInfo(url, httpclient);
            }
        }
    }

    /**
     * Fetches one listing page (solving a CAPTCHA first if one is served), prints every product
     * found on it and dumps the page HTML to {@link #DEBUG_DUMP_FILE}.
     *
     * @param url        absolute URL of the listing page
     * @param httpclient client used for all page requests; its cookie store carries the session
     * @return absolute URL of the next page, or {@code null} when the page has no next-page link
     * @throws Exception on network failures or when the CAPTCHA cannot be solved
     */
    static String printInfo(String url, CloseableHttpClient httpclient) throws Exception {
        String html = fetchHtml(url, httpclient);
        Document doc = Jsoup.parse(html);
        if (isCaptchaPage(html, doc)) { // CAPTCHA page was served
            solveCaptcha(doc, httpclient);
            // The page reached after solving is not necessarily the one asked for, so request the
            // original URL again with the now-validated cookies.
            doc = Jsoup.parse(fetchHtml(url, httpclient));
        }

        for (Element goodsEle : doc.getElementsByClass(RESULT_ITEM_CLASS)) {
            printItem(goodsEle);
        }

        dumpPage(doc);

        Element lastEle = doc.getElementsByClass("a-last").first();
        Element nextLink = lastEle == null ? null : lastEle.getElementsByTag("a").first();
        if (nextLink == null) {
            // No pagination control, or the "next" control carries no link: nothing more to fetch.
            return null;
        }
        return AMAZON_HOST + nextLink.attr("href");
    }

    /** Issues a GET with the browser-like headers and returns the response body as text. */
    private static String fetchHtml(String url, CloseableHttpClient httpclient) throws IOException {
        HttpGet get = new HttpGet(url);
        get.addHeader("accept-language", ACCEPT_LANGUAGE);
        get.addHeader("user-agent", USER_AGENT);
        try (CloseableHttpResponse response = httpclient.execute(get)) {
            // UTF-8 is only the fallback used when the response declares no charset.
            return EntityUtils.toString(response.getEntity(), "UTF-8");
        }
    }

    /** Tells whether a response is the CAPTCHA page, checking both the raw and the parsed HTML. */
    private static boolean isCaptchaPage(String html, Document doc) {
        return html.contains(CAPTCHA_MARKER) || doc.toString().contains(CAPTCHA_MARKER);
    }

    /**
     * Repeatedly recognises and submits the CAPTCHA until a non-CAPTCHA page comes back.
     *
     * @throws IllegalStateException if the CAPTCHA page lacks the expected form, or the CAPTCHA is
     *                               still shown after {@link #MAX_CAPTCHA_ATTEMPTS} rounds
     */
    private static void solveCaptcha(Document captchaDoc, CloseableHttpClient httpclient) throws Exception {
        Document doc = captchaDoc;
        for (int attempt = 1; attempt <= MAX_CAPTCHA_ATTEMPTS; attempt++) {
            // Validate the form before spending a recognition call on the image.
            Element eleForm = doc.getElementsByTag("form").first();
            if (eleForm == null) {
                throw new IllegalStateException("CAPTCHA page contains no <form> element");
            }
            Elements inputs = eleForm.getElementsByTag("input");
            if (inputs.size() < 2) {
                throw new IllegalStateException("CAPTCHA form has fewer than two <input> fields");
            }
            String amzn = inputs.get(0).val();
            String amznr = inputs.get(1).val();

            String code = checkCode(doc);
            System.out.println(code);

            // The hidden values may contain '+', '/', '?', '&' or '=', so they must be
            // form-encoded or the query string is parsed incorrectly by the server.
            String url2 = AMAZON_HOST + eleForm.attr("action")
                    + "?amzn=" + encode(amzn)
                    + "&amzn-r=" + encode(amznr)
                    + "&field-keywords=" + encode(code);
            String html = fetchHtml(url2, httpclient);
            doc = Jsoup.parse(html);
            if (!isCaptchaPage(html, doc)) {
                return;
            }
        }
        throw new IllegalStateException(
                "CAPTCHA still present after " + MAX_CAPTCHA_ATTEMPTS + " attempts");
    }

    /** Form-encodes one query-string value as UTF-8. */
    private static String encode(String value) throws IOException {
        return URLEncoder.encode(value, "UTF-8");
    }

    /** Prints the fields of one product tile; fields missing from the markup print as empty. */
    private static void printItem(Element goodsEle) {
        System.out.println("商品" + i++);
        // Links in the listing are relative to the site the listing was fetched from.
        String detailUrl = AMAZON_HOST + attrOfFirst(goodsEle.getElementsByTag("a"), "href");
        System.out.println("商品详情:" + detailUrl);
        System.out.println("ASIN:" + goodsEle.attr("data-asin"));
        System.out.println("UUID:" + goodsEle.attr("data-uuid"));
        System.out.println("封面图片:" + attrOfFirst(goodsEle.getElementsByTag("img"), "src"));
        System.out.println("名称:" + textOfFirst(goodsEle.getElementsByTag("h2")));

        Element starEle = goodsEle.getElementsByClass("a-icon-alt").first();
        if (starEle != null) {
            System.out.println("评分:" + starEle.text());
            Element ratingRow =
                    goodsEle.getElementsByClass("a-section a-spacing-none a-spacing-top-micro").first();
            Element countEle = ratingRow == null ? null : ratingRow.getElementsByClass("a-size-base").first();
            // A rated product has reviews, so a missing counter is reported as unknown, not 0.
            String count = countEle == null ? "未知" : countEle.text().replace(",", "");
            System.out.println("评价人数:" + count);
        } else {
            System.out.println("暂无评分");
            System.out.println("评价人数:0");
        }

        Element priceEle = goodsEle.getElementsByClass("a-offscreen").first();
        if (priceEle != null) {
            System.out.println("价格:" + priceEle.text().replace(",", ""));
        } else {
            System.out.println("价格:列表未显示价格,可能无货");
        }
        System.out.println("\n===================================\n");
    }

    /** Returns the named attribute of the first element, or "" when the list is empty. */
    private static String attrOfFirst(Elements elements, String attribute) {
        Element first = elements.first();
        return first == null ? "" : first.attr(attribute);
    }

    /** Returns the text of the first element, or "" when the list is empty. */
    private static String textOfFirst(Elements elements) {
        Element first = elements.first();
        return first == null ? "" : first.text();
    }

    /**
     * Overwrites {@link #DEBUG_DUMP_FILE} with the page HTML. This is a debugging aid only, so a
     * failure (for example the directory not existing on this machine) is reported and ignored
     * instead of aborting the crawl.
     */
    private static void dumpPage(Document doc) {
        try (BufferedWriter bw = new BufferedWriter(new FileWriter(DEBUG_DUMP_FILE, false))) {
            bw.newLine();
            bw.write(doc.toString());
        } catch (IOException e) {
            System.err.println("Could not write debug dump to " + DEBUG_DUMP_FILE + ": " + e);
        }
    }

    /**
     * Downloads the CAPTCHA image referenced by the CAPTCHA page into {@link #CAPTCHA_IMAGE_FILE}
     * and returns the recognised characters.
     *
     * @param doc parsed CAPTCHA page
     * @return the recognised code, or "" when recognition did not succeed
     * @throws IllegalStateException if the page contains no CAPTCHA image
     * @throws Exception             on download or recognition failures
     */
    private static String checkCode(Document doc) throws Exception {
        Element imageRow = doc.getElementsByClass("a-row a-text-center").first();
        Element imgEle = imageRow == null ? null : imageRow.getElementsByTag("img").first();
        if (imgEle == null) {
            throw new IllegalStateException("CAPTCHA image not found on CAPTCHA page");
        }
        String img = imgEle.attr("src");
        System.out.println(img);

        byte[] data1;
        // The image is fetched with a throwaway, cookie-less client, as before; both the client
        // and the response are now closed.
        try (CloseableHttpClient imageClient = HttpClients.custom().build();
                CloseableHttpResponse response = imageClient.execute(new HttpGet(img))) {
            data1 = EntityUtils.toByteArray(response.getEntity());
        }
        // Files.write truncates an existing file, so a shorter image never keeps the tail of the
        // previous, longer one.
        Files.write(Paths.get(CAPTCHA_IMAGE_FILE), data1);
        return readCode();
    }

    /**
     * Uploads {@link #CAPTCHA_IMAGE_FILE} to the ttshitu.com recognition API and extracts the
     * recognised text from its JSON reply.
     *
     * @return the recognised code, or "" when the reply does not report success or has no result
     * @throws Exception on I/O failures or when the reply is not valid JSON
     */
    private static String readCode() throws Exception {
        String username = "【替换为用户名】";
        String password = "【替换为密码】";

        Map<String, String> data = new HashMap<>();
        data.put("username", username);
        data.put("password", password);
        data.put("typeid", "1002");

        String resultString;
        // NOTE(review): the credentials travel over plain HTTP; switch to HTTPS if the service
        // offers it.
        try (InputStream inputStream = new FileInputStream(new File(CAPTCHA_IMAGE_FILE))) {
            resultString = Jsoup.connect("http://api.ttshitu.com/create.json").data(data)
                    .data("image", "amazonCode.jpg", inputStream).ignoreContentType(true).post().text();
        }

        JSONObject reply = JSONObject.parseObject(resultString);
        if (reply == null || !reply.getBooleanValue("success")) {
            return "";
        }
        JSONObject payload = reply.getJSONObject("data");
        String result = payload == null ? null : payload.getString("result");
        return result == null ? "" : result;
    }
}