java httpclient 502_Java爬虫(HttpClient)

网络爬虫主要功能就是对网页内容进行爬取,然后根据特定需求对内容进行过滤分析。

针对网页内容爬取,假设需求为:对一个网站进行全站爬取,将爬取到的文件按类型保存到本地磁盘,并支持配置爬取的最大层次、最大链接数、爬取类型范围等参数。

这里使用 Kafka 主题作为爬虫任务队列,基于 Spring Boot 做了一个简单的实现。

任务创建接口

这里提供了两个接口:一个根据输入的网站爬取配置创建爬取任务,另一个根据任务id查询任务状态。没有提供详细结果的查询接口,结果可直接在数据库中查看。

/**
 * REST endpoints for creating a whole-site crawl job and for polling its status.
 */
@RestController
public class CrawlerTaskController {

    @Autowired
    private WebsiteTaskService websiteTaskService;

    @Autowired
    private WebsiteTaskDao websiteTaskDao;

    @Autowired
    private TaskProducer taskProducer;

    /**
     * Registers a new website crawl job and enqueues its seed URL as the first URL task.
     *
     * @param item crawl configuration bound from the request parameters
     * @return a map with the job {@code id} and a confirmation {@code message}
     */
    @PostMapping("task/add")
    @ResponseBody
    public Map<String, Object> addWebsiteTask(WebsiteTask item) {
        // The seed URL itself is the first task of the job.
        item.setTaskCount(1);
        websiteTaskService.put(item);

        UrlTask task = new UrlTask();
        task.setUrl(item.getUrl());
        task.setParentId(-1); // the seed task has no parent page
        // Link the seed task to the website job. The original copied the new task's own
        // (unset) rootId onto itself, so the job could never be looked up from the task.
        // item.getId() is expected to be populated by put(...) above, as the response
        // below already relies on it.
        task.setRootId(item.getId());
        task.setLevel(0);
        taskProducer.sendUrlTask(task);

        Map<String, Object> map = new HashMap<>();
        map.put("id", item.getId());
        map.put("message", "爬虫任务添加成功!");
        return map;
    }

    /**
     * Looks up a website crawl job by id.
     *
     * @param id job id returned by {@code task/add}
     * @return the stored job
     * @throws java.util.NoSuchElementException if no job with that id exists
     */
    @PostMapping("task/get")
    @ResponseBody
    public WebsiteTask getWebsiteTask(int id) {
        // Same exception type as the former unchecked Optional.get(), but with a useful message.
        return websiteTaskDao.findById(id)
                .orElseThrow(() -> new java.util.NoSuchElementException("WebsiteTask not found, id=" + id));
    }
}

网站任务实体

/**
 * JPA entity describing one whole-site crawl job: its configuration limits and
 * its progress counters. The members replaced by "......" below are not shown
 * in this listing.
 */
@Entity

@EntityListeners(AuditingEntityListener.class)

public class WebsiteTask {

@Id

@GeneratedValue(strategy = GenerationType.IDENTITY)

private int id;

@Column(length = 1024)

private String url;// website URL, usually the home page link

private int maxLevel;// maximum crawl depth

private int maxCount;// maximum number of links to crawl

private int outerLevel;// maximum crawl depth for external links

private String range;// range of content types to crawl

private int taskCount;// number of URL tasks created

private int finishCount;// number of URL tasks finished

private int state = 1;// status: 1 = running; 2 = finished

@CreatedDate

private Date createTime;// creation time

private Date finishTime;// completion time

// Not persisted. NOTE(review): raw List; presumably the parsed form of `range`
// holding page-format names as strings — confirm against the elided accessors.
@Transient

private List ranges;

......

}

url任务实体

/**
 * JPA entity recording a single URL crawl: where the URL sits in the crawl tree
 * and the outcome of fetching it. The members replaced by "......" below are
 * not shown in this listing.
 */
@Entity

@EntityListeners(AuditingEntityListener.class)

public class UrlTask {

@Id

@GeneratedValue(strategy = GenerationType.IDENTITY)

private int id;

private int parentId;// id of the parent page's task

private int rootId;// id of the owning website task

@Column(length = 1024)

private String url;

private String contentType;// content type of the page

private long contentLength;// content length

private int level;// current crawl depth

private long useMillis;// time spent crawling

private int respCode;// response status code

private String remark;// remark

private String filePath;// path of the file saved on disk

@CreatedDate

private Date createTime;// creation time

......

}

爬取的网页模型定义

/**
 * Result holder for one fetched URL: response metadata, the parsed document
 * (for HTML) and the location of the saved file.
 */
public class WebPageModel {

    public int respCode = 200; // response status code

    public String message; // error information

    public Document document; // Document object of the HTML page

    public String encoding; // page encoding

    public String contentType; // content type of the page

    public long contentLength; // content length

    public String filePath; // file path

    public String fileExt; // file extension

    public PageFormat format = PageFormat.OTHER;

    public enum PageFormat {
        HTML, IMAGE, AUDIO, VIDEO, TXT, WORD, EXCEL, PPT, PDF, COMPRESS, APK, IPA, OTHER
    }

    /**
     * Re-derives {@link #format} from {@link #contentType}; for a generic
     * octet-stream response the type is looked up from {@link #fileExt} instead.
     * When no category matches, {@code format} keeps its current value.
     */
    public void updateFormat() {
        final boolean genericBinary = ContentTypeUtil.OCTET_STREAM_TYPE.equalsIgnoreCase(contentType);
        final String effectiveType = genericBinary ? ContentTypeUtil.getContentType(fileExt) : contentType;

        final PageFormat matched = matchFormat(effectiveType);
        if (matched != null) {
            format = matched;
        }
    }

    /**
     * Returns the first category whose predicate accepts {@code type}, or
     * {@code null} when none does. Predicates are tried in a fixed order.
     */
    private static PageFormat matchFormat(String type) {
        if (ContentTypeUtil.isHtml(type)) {
            return PageFormat.HTML;
        }
        if (ContentTypeUtil.isImage(type)) {
            return PageFormat.IMAGE;
        }
        if (ContentTypeUtil.isAudio(type)) {
            return PageFormat.AUDIO;
        }
        if (ContentTypeUtil.isVideo(type)) {
            return PageFormat.VIDEO;
        }
        if (ContentTypeUtil.isTxt(type)) {
            return PageFormat.TXT;
        }
        if (ContentTypeUtil.isWord(type)) {
            return PageFormat.WORD;
        }
        if (ContentTypeUtil.isExcel(type)) {
            return PageFormat.EXCEL;
        }
        if (ContentTypeUtil.isPpt(type)) {
            return PageFormat.PPT;
        }
        if (ContentTypeUtil.isPdf(type)) {
            return PageFormat.PDF;
        }
        if (ContentTypeUtil.isCompress(type)) {
            return PageFormat.COMPRESS;
        }
        if (ContentTypeUtil.isApk(type)) {
            return PageFormat.APK;
        }
        if (ContentTypeUtil.isIpa(type)) {
            return PageFormat.IPA;
        }
        return null;
    }
}

url去重

/**
 * Per-job URL de-duplicator with an upper bound on the number of distinct URLs
 * it will accept. Access to the underlying set is guarded by a private lock.
 */
public class UrlDuplicateFilter {

    private final Object lock = new Object();

    // Typed set (was a raw Set) so only URL strings can be stored.
    private final Set<String> set = new HashSet<>();

    private final int maxCount; // maximum number of distinct URLs

    /**
     * @param maxCount maximum number of distinct URLs this filter accepts
     */
    public UrlDuplicateFilter(int maxCount) {
        this.maxCount = maxCount;
    }

    /**
     * Records {@code url} if it is new and the limit has not been reached.
     *
     * @param url candidate URL
     * @return {@code true} if the URL was newly recorded; {@code false} if it is
     *         blank, already recorded, or the limit has been reached
     */
    public boolean filter(String url) {
        if (StringUtils.isBlank(url)) {
            return false;
        }
        synchronized (lock) {
            if (reachMaxCount()) {
                return false;
            }
            // add() already reports whether the element was absent, so a separate
            // contains() lookup is unnecessary.
            return set.add(url);
        }
    }

    /**
     * Tells whether the number of recorded URLs has reached the limit.
     * Must be called while holding {@code lock}.
     */
    private boolean reachMaxCount() {
        return set.size() >= maxCount;
    }
}

一条url的爬取执行过程

/**
 * Runnable that processes one URL task: downloads the URL, persists the outcome,
 * enqueues newly discovered child links, and updates the job's counters.
 */
public class CrawlerTask implements Runnable {

    private static final Logger LOG = LoggerFactory.getLogger(CrawlerTask.class);

    private final UrlTask task;

    private final WebsiteTaskService websiteTaskService;

    private final UrlTaskDao urlTaskDao;

    private final TaskProducer taskProducer;

    /**
     * @param task               the URL task to execute
     * @param websiteTaskService job lookup, duplicate filter and counter updates
     * @param urlTaskDao         persistence for the URL task result
     * @param taskProducer       queue producer used to enqueue child URL tasks
     */
    public CrawlerTask(UrlTask task, WebsiteTaskService websiteTaskService, UrlTaskDao urlTaskDao,
            TaskProducer taskProducer) {
        this.task = task;
        this.websiteTaskService = websiteTaskService;
        this.urlTaskDao = urlTaskDao;
        this.taskProducer = taskProducer;
    }

    @Override
    public void run() {
        long millis = System.currentTimeMillis();
        WebsiteTask website = websiteTaskService.getWebsiteTask(task.getRootId());

        // Fetch the URL content.
        WebPageModel page = PageDownloadUtil.executeGet(task.getUrl(), website.getRanges());
        task.setContentLength(page.contentLength);
        task.setContentType(page.contentType);
        task.setRespCode(page.respCode);
        task.setRemark(page.message);
        task.setFilePath(page.filePath);
        task.setUseMillis(System.currentTimeMillis() - millis);
        urlTaskDao.saveAndFlush(task);

        // Only a parsed HTML document can yield child links; non-HTML or failed
        // downloads leave page.document null, so link extraction is skipped for them.
        if (page.document != null && task.getLevel() < website.getMaxLevel()) {
            enqueueChildren(website, page);
        }

        // Update the finished-task counter.
        websiteTaskService.addFinishCount(task.getRootId());
        LOG.info("爬取用时={},url={}", System.currentTimeMillis() - millis, task.getUrl());
    }

    /** Extracts links from the page, de-duplicates them and enqueues one child task per new link. */
    private void enqueueChildren(WebsiteTask website, WebPageModel page) {
        // Typed as Set<String>: the loop below iterates the elements as String.
        Set<String> childUrls = new UrlExtract(page.document, task.getUrl()).extractFromA().extractFromFrame()
                .extractFromIframe().extractFromImg().getUrls();
        if (childUrls.isEmpty()) {
            return;
        }

        UrlDuplicateFilter dupFilter = websiteTaskService.getUrlDuplicateFilter(task.getRootId());
        int addCount = 0;
        for (String childUrl : childUrls) {
            // External links are only followed while the current depth is below outerLevel.
            if (CrawlerUtil.isOuterUrl(task.getUrl(), childUrl) && task.getLevel() >= website.getOuterLevel()) {
                continue;
            }
            // De-duplicate the extracted child links.
            if (dupFilter.filter(childUrl)) {
                UrlTask childTask = new UrlTask();
                childTask.setUrl(childUrl);
                childTask.setParentId(task.getId());
                childTask.setRootId(task.getRootId());
                childTask.setLevel(task.getLevel() + 1);
                taskProducer.sendUrlTask(childTask);
                addCount++;
            }
        }

        // Update the created-task counter.
        websiteTaskService.addTaskCount(task.getRootId(), addCount);
    }
}

网页爬取工具

public class PageDownloadUtil {

private static final Logger LOG = LoggerFactory.getLogger(PageDownloadUtil.class);

private static final int MAX_HTML_LENGTH = 20 * 1024 * 1024;//html页面限制20M

private static final int MAX_FILE_LENGTH = 500 * 1024 * 1024;//其它附件类型限制500M

private static final String FOLDER_NAME = "d:/temp/" + UUID.randomUUID().toString().replace("-", "") + "/";

private static final AtomicInteger INDEX = new AtomicInteger();

private static final CloseableHttpClient client = HttpClientUtil.createHttpClient();

//自定义错误返回值

private static final Map CODE_MAP = new HashMap<>();

static{

CODE_MAP.put(-501,"uri解析异常");

CODE_MAP.put(-502,"网络协议异常");

CODE_MAP.put(-503,"域名解析异常");

CODE_MAP.put(-504,"http连接异常");

CODE_MAP.put(-505,"网络IO异常");

CODE_MAP.put(-506,"页面解析异常");

CODE_MAP.put(-507,"编码格式异常");

CODE_MAP.put(-508,"内容长度超出限制");

CODE_MAP.put(-509,"网页类型超出可爬取范围");

}

public static WebPageModel executeGet(String url, List ranges) {

WebPageModel page = new WebPageModel();

int redirectTimes = 0;

boolean redirect;

URI uri = CrawlerUtil.urlConvertToUri(url);

if (uri == null) {

page.respCode = -501;

page.message = CODE_MAP.get(page.respCode);

return page;

}

do {

redirectTimes++;

redirect = false;

HttpGet method = new HttpGet(uri);

HttpClientUtil.setHeader(method, url);

CloseableHttpResponse response = null;

long millis = System.currentTimeMillis();

try {

response = client.execute(method);

page.respCode = response.getStatusLine().getStatusCode();

if (page.respCode == HttpStatus.SC_OK) {

download(page, url, response, ranges);

} else if (page.respCode >= 300 && page.respCode < 400) {// 页面跳转

Header[] locationHeader = response.getHeaders("location");

if (locationHeader != null && locationHeader.length > 0) {

String redirectUrl = locationHeader[0].getValue();

if (StringUtils.isNotBlank(redirectUrl) && !url.equals(redirectUrl)) {

uri = CrawlerUtil.urlConvertToUri(redirectUrl);

redirect = true;

}

}

}

} catch (ClientProtocolException e) {

LOG.error("", e);

page.respCode = -502;

page.message = CODE_MAP.get(page.respCode);

} catch (UnknownHostException e) {

LOG.error("", e);

page.respCode = -503;

page.message = CODE_MAP.get(page.respCode);

} catch (HttpHostConnectException e) {

LOG.error("", e);

page.respCode = -504;

page.message = CODE_MAP.get(page.respCode);

} catch (IOException e) {//连接超时尝试重连3次

redirectTimes++;

redirect = true;

LOG.error(String.format("第%s次链接失败,executeusetime=%s", redirectTimes / 2,

System.currentTimeMillis() - millis), e);

page.respCode = -505;

page.message = CODE_MAP.get(page.respCode);

} finally {

if (response != null) {

EntityUtils.consumeQuietly(response.getEntity());

try {

response.close();

} catch (IOException e) {

LOG.error("responseclose", e);

}

}

method.releaseConnection();

}

} while (redirect && redirectTimes <= 5);

return page;

}

private static void download(WebPageModel page, String url, CloseableHttpResponse response,

List ranges) {

HttpEntity entity = response.getEntity();

page.contentLength = entity.getContentLength();// 此方法不准确,经常返回-1,后面重新赋值

// ContentType.getOrDefault(entity).getMimeType()提取可能会因为非支持的charset类型而报错,所以这里改为手工提取mimeType

Header header = entity.getContentType();

if (header != null) {

HeaderElement[] headerElements = header.getElements();

if (headerElements != null && headerElements.length > 0) {

page.contentType = headerElements[0].getName();

}

}

if (ContentTypeUtil.OCTET_STREAM_TYPE.equalsIgnoreCase(page.contentType)) {

page.fileExt = HttpClientUtil.getOctetStreamFileExt(url, response);

} else if (page.contentType == null) {

//若未从header中取到contentType,根据url后缀判断

if (url.lastIndexOf("/") > 8) {

String name = url.substring(url.lastIndexOf("/"));

if (name.contains(".")) {

page.contentType = ContentTypeUtil.getContentType(name.substring(name.lastIndexOf(".")));

}

}

}

page.updateFormat();

if (ranges.contains(page.format.toString())) {

if (page.format == WebPageModel.PageFormat.HTML) {

if (page.contentLength == 0 || page.contentLength > MAX_HTML_LENGTH) {

page.respCode = -508;

page.message = CODE_MAP.get(page.respCode);

return;

}

try {

String html = null;

Document document = null;

String charset = null;

if (header != null) {

charset = CrawlerUtil.judgeCharset(header.toString());

}

if (charset != null) {

html = EntityUtils.toString(entity, charset);

document = Jsoup.parse(html);

} else {

byte[] data = EntityUtils.toByteArray(entity);

html = new String(data, CrawlerUtil.UTF_8);

document = Jsoup.parse(html);

charset = CrawlerUtil.getCharsetFromMeta(document);

if (charset != null && !CrawlerUtil.UTF_8.equals(charset)) {

html = new String(data, charset);

document = Jsoup.parse(html);

}

}

byte[] data = html.getBytes(CrawlerUtil.UTF_8);

page.contentLength = data.length;

if (page.contentLength <= 0 || page.contentLength > MAX_HTML_LENGTH) {

page.respCode = -508;

page.message = CODE_MAP.get(page.respCode);

return;

}

page.encoding = CrawlerUtil.UTF_8;

page.document = document;

createFilePath(page);

HttpClientUtil.exportDataAsFile(data, page.filePath);

} catch (ParseException e) {

LOG.error("", e);

page.respCode = -506;

page.message = CODE_MAP.get(page.respCode);

} catch (UnsupportedEncodingException e) {

LOG.error("", e);

page.respCode = -507;

page.message = CODE_MAP.get(page.respCode);

} catch (IOException e) {

LOG.error("", e);

page.respCode = -505;

page.message = CODE_MAP.get(page.respCode);

}

} else {// 如果是非html页面直接下载

if (page.contentLength == 0 || page.contentLength > MAX_FILE_LENGTH) {

page.respCode = -508;

page.message = CODE_MAP.get(page.respCode);

return;

}

if (page.fileExt == null) {

page.fileExt = ContentTypeUtil.getExtendFileName(page.contentType);

}

createFilePath(page);

HttpClientUtil.exportEntityAsFile(entity, page.filePath);

}

} else {

page.respCode = -509;

page.message = CODE_MAP.get(page.respCode);

}

}

private static void createFilePath(WebPageModel page) {

String filePath = FOLDER_NAME + page.format.toString() + "/";

File file = new File(filePath);

file.mkdirs();

page.filePath = filePath + INDEX.getAndIncrement() + page.fileExt;

}

}

项目地址

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值