关于html爬取数据的文章已经有很多了,我今天主要和大家交流的是如何爬取flash网页的数据。这方面资料相对比较少,主要是html5兴起后现在flash站很少了,不过用于技术研究还是可以尝试一下,这篇文章就主要介绍我爬取数据的整个过程。
以房产透明网为例,该网站的一房一价数据就是通过flash显示,接下来将一步步介绍如何获取对应的数据。
特别声明,本文章仅做相关技术学习交流,数据版权为成都透明网,个人或企业请勿用于商业或非法用途,如该文章有不妥之处请联系本人删除。
我找了一个楼盘用浏览器自带的工具查看,可以看到返回的数据是乱码,如下图。
这主要是因为返回的数据格式是application/x-amf,浏览器无法正常解析,接下来就需要用到抓包工具Charles了。这个工具未付费的话每运行30分钟会自动关闭,我觉得30分钟也够用了,所以目前一直忍受着。
1.首先打开Charles
2.打开透明网一房一价页面,点击一个单元后就可以看到请求的数据了
这里面比较重要的几个部分我都截取了一下,最后的HOUSEITEMLIST就是我们需要处理的数据了。
3.通过抓包工具已经可以看到请求的数据了,接下来就需要用java模拟amf的请求。
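<dependency>
    <groupId>org.apache.flex.blazeds</groupId>
    <artifactId>flex-messaging-core</artifactId>
    <version>4.7.2</version>
</dependency>
<dependency>
    <groupId>org.apache.flex.blazeds</groupId>
    <artifactId>flex-messaging-common</artifactId>
    <version>4.7.2</version>
</dependency>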
先要引入这两个包,这个请求代码如下,部分参数我设置为******,如果需要测试自行粘贴对应的参数。
/**
 * Sends one AMF remoting request to the funi.com message broker and prints, for every
 * entry of the returned {@code HOUSEITEMLIST}, the decoded unit number, usage and total area.
 *
 * <p>The operation name, message id and {@code DSId} header are masked with asterisks;
 * replace them with the values captured from a real request before running.
 *
 * <p>Failures are reported on {@code System.err} instead of being thrown.
 *
 * @param args unused
 */
public static void main(String[] args) {
    HttpURLConnection urlConnection = null;
    try {
        URL urlObject = new URL("http://cd.funi.com/messagebroker/amf");
        urlConnection = (HttpURLConnection) urlObject.openConnection();
        urlConnection.setDoOutput(true);
        urlConnection.setRequestProperty("Content-type", "application/x-amf;charset=gb2312");
        urlConnection.setRequestProperty("Host", "cd.funi.com");
        urlConnection.setRequestProperty("Origin", "http://user.funi.com");
        urlConnection.setRequestProperty("Referer", "http://user.funi.com/resource/swf/house/FundateClient_www.swf?communityId=DAZXiSEGhWZLhWIrVooMiDNjk4UzP3et1CztbkK1SZrXmBDQfGR%2BAFaCxnPg5MFf&t=20181131/[[DYNAMIC]]/1");
        urlConnection.setRequestProperty("Cookie", "pgv_pvi=9961606144; pgv_si=s9152640000; Hm_lvt_77be290eccb6ceb57b524a860b6faadc=1545658648,1545745229,1545917030,1546227366; Hm_lpvt_77be290eccb6ceb57b524a860b6faadc=1546227368");

        // Build the request message (original note: 0 = AMF0, 3 = AMF3).
        ActionMessage requestMessage = new ActionMessage();
        AmfTrace amfTrace = new AmfTrace();
        RemotingMessage remotingMessage = new RemotingMessage();
        remotingMessage.setOperation("***********************************");
        remotingMessage.setSource(null);
        remotingMessage.setClientId("FF66DFC9-B00D-2C39-E122-6B6752416543");
        remotingMessage.setDestination("dEEDOCService");
        remotingMessage.setMessageId("******************************");
        remotingMessage.setHeader("DSEndpoint", "my-amf");
        remotingMessage.setHeader("DSId", "*************************");
        remotingMessage.setTimeToLive(0);
        remotingMessage.setTimestamp(0);
        remotingMessage.setBody(new Object[]{"kezlmwCvdjGPckPbY1SmeL3frogB2sfc7IgjBssaFJ2ihf5M93DgMgf5mIqLiWgMNvNwBsVQKuDfTympu4bAjLV9/3mGEHK+MfNqVZKTY0xC3uGOkDg+i2Pt9oTDxBm1xU5Cvmjmd/9mXzN/v3UOvSoqKlLNYy42g8uGAq+JFczhHpdRi7LBtP56E8OJaGq4VksJJnPhGLtMLt1T3wZZKzcV4MqJ2U7NTg7q5AmyCC89nvetx/5Gop8mUBe0tHQdSop8mhHerHn+n7y5O1BL3sRS8T3e1B9F2txtWzcNX0NBzDgAMpfa3AJAhaZ7yuhwd5VtLYD+KquXCUmxJAd/YSjjZGAYYomWjZqRMfO5x5cP/SH8AeI4BiKbTQ+2UygOvYCiTAzy+8GNG0oKpTDCnP2/j2CFhISaMutwAFTF7CZw6HCzJq+2iA8sVnNmCePQMieuZOyq7LG0PppzHRkQYGpUzGynN4FJ8Dz7TBXmuKu7bWJ7jlrYdHbsexEGhoI2fEh/hivzSuCaBfWojChwMQOrtiYKG/YYEgtxNmEUYVdDH5XUiFHVH0V3W+O16fluHZUoaJdvZ+Fbm9oJIB2cz1X9hQSOcs3Cc7i95hhJ0SdQGa1yMw7c2vJSWzbTKuc6rnFm8IDmR6qm6sEIUHRokN56IsDqS+ZHaXWNoOG4q0xR97tFCPlrURWxLcJX3tIJ4xl/imVVlifcAZX4/gXkykAGpM7tdGOy0J/hegAZqCY="});
        MessageBody amfMessage = new MessageBody(null, "/3", new Object[]{remotingMessage});
        requestMessage.addBody(amfMessage);

        // Serialize into memory first so nothing is written to the connection
        // unless the whole message could be encoded.
        ByteArrayOutputStream outBuffer = new ByteArrayOutputStream();
        AmfMessageSerializer amfMessageSerializer = new AmfMessageSerializer();
        amfMessageSerializer.initialize(new SerializationContext(), outBuffer, amfTrace);
        amfMessageSerializer.writeMessage(requestMessage);

        // Flush and close the connection's own stream (not the in-memory buffer)
        // so the request body is fully handed over before the response is read.
        OutputStream outputStream = urlConnection.getOutputStream();
        try {
            outBuffer.writeTo(outputStream);
            outputStream.flush();
        } finally {
            outputStream.close();
        }

        InputStream responseStream = new BufferedInputStream(urlConnection.getInputStream());
        try {
            ActionContext actionContext = new ActionContext();
            ActionMessage message = new ActionMessage();
            actionContext.setRequestMessage(message);
            // Maps the alias "DSK" to the local DTO; HouseTable must exist in this project
            // under exactly this package name.
            ClassAliasRegistry.getRegistry().registerAlias("DSK", "com.funi.frontend.dto.HouseTable");
            MessageDeserializer deserializer = new AmfMessageDeserializer();
            deserializer.initialize(new SerializationContext(), responseStream, amfTrace);
            deserializer.readMessage(message, actionContext);

            // Iterate as Object and cast: this compiles whether getBodies() is declared
            // with or without a type argument.
            for (Object bodyObject : message.getBodies()) {
                MessageBody msg = (MessageBody) bodyObject;
                String targetURI = msg.getTargetURI();
                if (targetURI.endsWith(MessageIOConstants.RESULT_METHOD)) {
                    AcknowledgeMessage acknowledgeMessage = (AcknowledgeMessage) msg.getData();
                    ASObject asObject = (ASObject) acknowledgeMessage.getBody();
                    ArrayCollection houseitemlist = (ArrayCollection) asObject.get("HOUSEITEMLIST");
                    if (houseitemlist == null) {
                        System.err.println("Response contains no HOUSEITEMLIST entry");
                        continue;
                    }
                    for (Object o : houseitemlist) {
                        HouseTable houseTable = (HouseTable) o;
                        System.out.println(DecodeUtils.decode(houseTable.getUnitNo()));
                        System.out.println(DecodeUtils.decode(houseTable.getUsage()));
                        System.out.println(DecodeUtils.decode(houseTable.getTotalArea()));
                    }
                } else if (targetURI.endsWith(MessageIOConstants.STATUS_METHOD)) {
                    // Report the fault instead of dropping it.
                    System.err.println("Server error: " + msg.getData());
                }
            }
        } finally {
            responseStream.close();
        }
    } catch (Exception e) {
        // Keep the cause visible; a bare "error" makes failures impossible to diagnose.
        System.err.println("AMF request failed: " + e);
        e.printStackTrace();
    } finally {
        if (urlConnection != null) {
            urlConnection.disconnect();
        }
    }
}
packagecom.funi.frontend.dto;public classHouseTable {privateBoolean isMortgage;privateString status;privateString roomNo;privateString listWaterPrice;privateString typeHouse;privateString huxId;privateString buildingNo;privateString fitmentPrice;privateString floorNo;privateString listPrice;privateBoolean isSealUp;privateString usage;privateString totalArea;privateObject houseTableList;privateObject phase;privateString unitNo;privateString buildingId;privateString communityId;publicBoolean getMortgage() {returnisMortgage;
}public voidsetMortgage(Boolean mortgage) {
isMortgage=mortgage;
}publicString getStatus() {returnstatus;
}public voidsetStatus(String status) {this.status =status;
}publicString getRoomNo() {returnroomNo;
}public voidsetRoomNo(String roomNo) {this.roomNo =roomNo;
}publicString getListWaterPrice() {returnlistWaterPrice;
}public voidsetListWaterPrice(String listWaterPrice) {this.listWaterPrice =listWaterPrice;
}publicString getTypeHouse() {returntypeHouse;
}public voidsetTypeHouse(String typeHouse) {this.typeHouse =typeHouse;
}publicString getHuxId() {returnhuxId;
}public voidsetHuxId(String huxId) {this.huxId =huxId;
}publicString getBuildingNo() {returnbuildingNo;
}public voidsetBuildingNo(String buildingNo) {this.buildingNo =buildingNo;
}publicString getFitmentPrice() {returnfitmentPrice;
}public voidsetFitmentPrice(String fitmentPrice) {this.fitmentPrice =fitmentPrice;
}publicString getFloorNo() {returnfloorNo;
}public voidsetFloorNo(String floorNo) {this.floorNo =floorNo;
}publicString getListPrice() {returnlistPrice;
}public voidsetListPrice(String listPrice) {this.listPrice =listPrice;
}publicBoolean getSealUp() {returnisSealUp;
}public voidsetSealUp(Boolean sealUp) {
isSealUp=sealUp;
}publicString getUsage() {returnusage;
}public voidsetUsage(String usage) {this.usage =usage;
}publicString getTotalArea() {returntotalArea;
}public voidsetTotalArea(String totalArea) {this.totalArea =totalArea;
}publicObject getHouseTableList() {returnhouseTableList;
}public voidsetHouseTableList(Object houseTableList) {this.houseTableList =houseTableList;
}publicObject getPhase() {returnphase;
}public voidsetPhase(Object phase) {this.phase =phase;
}publicString getUnitNo() {returnunitNo;
}public voidsetUnitNo(String unitNo) {this.unitNo =unitNo;
}publicString getBuildingId() {returnbuildingId;
}public voidsetBuildingId(String buildingId) {this.buildingId =buildingId;
}publicString getCommunityId() {returncommunityId;
}public voidsetCommunityId(String communityId) {this.communityId =communityId;
}
}
最后获取到对应数据后用base64解码一下即可。
特别声明,本文章仅做相关技术学习交流,数据版权为成都透明网,个人或企业请勿用于商业或非法用途,如该文章有不妥之处请联系本人删除。
喜欢java开发的可以加我qq3369245209,后面会建立一个java开发高级群,下期将介绍如何爬取app数据。