微医网爬虫(一) Java 实现

9 篇文章 1 订阅
7 篇文章 0 订阅

爬取微医网医生的基本数据,获取每个医生的URL之后,可以使用以下方法解析:

想要采集医生历史问诊详细信息的同学可以移步我们另一篇博客:传送门

/**
 * Downloads a doctor's profile page and copies the fields it can find into a new {@code Doctor}.
 *
 * <p>Every page section is treated as optional: when an element is missing, the
 * corresponding {@code Doctor} field is simply left unset. An {@link IOException} raised
 * while fetching the page is printed and the partially filled {@code Doctor} (id only)
 * is still returned.
 *
 * @param url profile page URL; the doctor id is taken as everything after its first
 *            30 characters, so the URL must be at least 30 characters long
 * @return the populated {@code Doctor}; never {@code null}
 */
public Doctor getDoctorInfor(String url) {
        Doctor doctor = new Doctor();
        // Extract the id: everything after the first 30 characters of the URL.
        // NOTE(review): presumably the fixed prefix is "https://www.guahao.com/expert/" — confirm with callers.
        doctor.setId(url.substring(30));
        System.out.print("正在获取:" + doctor.getId() + "\t");
        try {
            // Connection.get() either returns a parsed document or throws, so no null check is needed.
            Document doc = Jsoup.connect(url).get();

            Element detail = doc.selectFirst("div[class=detail word-break]");
            if (detail != null) {
                // Name and job title
                Element h1 = detail.selectFirst("h1");
                if (h1 != null) {
                    Element nameEl = h1.selectFirst("Strong");
                    Element jobEl = h1.selectFirst("span");
                    if (nameEl != null) {
                        doctor.setName(nameEl.text());
                        System.out.print(nameEl.text() + "\t");
                    }
                    if (jobEl != null) {
                        doctor.setJob(jobEl.text());
                    }
                }

                // Expert flag: "1" when the expert-group link is present, otherwise "0"
                doctor.setIsExpert(detail.selectFirst("a[class=expert-group]") != null ? "1" : "0");

                // Hospital (first link) and department (second link).
                // Sizes are checked explicitly: indexing a missing second link would throw.
                Element hosDiv = detail.selectFirst("div[class=hospital]");
                if (hosDiv != null) {
                    Elements links = hosDiv.select("a");
                    if (links.size() >= 1) {
                        doctor.setHospital(links.get(0).text());
                    }
                    if (links.size() >= 2) {
                        doctor.setRoom(links.get(1).text());
                    }
                }

                // Areas of expertise
                Element goodDiv = detail.selectFirst("div[class=goodat]");
                if (goodDiv != null) {
                    Element goodSpan = goodDiv.selectFirst("span");
                    if (goodSpan != null) {
                        doctor.setGoodAt(goodSpan.text());
                    }
                }

                // Summary: prefer the link's data-description attribute, fall back to the span text
                Element aboutDiv = detail.selectFirst("div[class=about]");
                if (aboutDiv != null) {
                    Element aboutLink = aboutDiv.selectFirst("a");
                    if (aboutLink != null) {
                        doctor.setSummary(aboutLink.attr("data-description"));
                    } else {
                        Element aboutSpan = aboutDiv.selectFirst("span");
                        if (aboutSpan != null) {
                            doctor.setSummary(aboutSpan.text());
                        }
                    }
                }
            }

            // Rating and the two volume counters; only read when exactly three values are present.
            // NOTE(review): the original comment listed "rating, consultations, appointments" but the
            // code stores index 1 as appointments and index 2 as consultations — confirm the page order.
            Element statusDiv = doc.selectFirst("div[class=status]");
            if (statusDiv != null) {
                Element dataDiv = statusDiv.selectFirst("div[class=data]");
                if (dataDiv != null) {
                    Elements strong = dataDiv.select("strong");
                    if (strong.size() == 3) {
                        doctor.setMarks(strong.get(0).text());
                        doctor.setApoint(strong.get(1).text());
                        doctor.setAsk(strong.get(2).text());
                    }
                }
            }

            // Follower count
            Element summaryDiv = doc.selectFirst("div[class=summary]");
            if (summaryDiv != null) {
                Element markSpan = summaryDiv.selectFirst("span[class=mark-count]");
                if (markSpan != null) {
                    doctor.setFocus(markSpan.text());
                }
            }

            // Prices: the first character of each price text is dropped (presumably a currency sign).
            // Empty texts are skipped because substring(1) on "" would throw.
            Element priceDiv = doc.selectFirst("div[class=consult-type]");
            if (priceDiv != null) {
                Elements prices = priceDiv.select("p[class=current-price]");
                if (prices.size() >= 1) {
                    String first = prices.get(0).text();
                    if (!first.isEmpty()) {
                        doctor.setPrice1(first.substring(1));
                    }
                }
                if (prices.size() == 2) {
                    String second = prices.get(1).text();
                    if (!second.isEmpty()) {
                        doctor.setPrice2(second.substring(1));
                    }
                }
            }

            // Comment count
            Element commentDiv = doc.selectFirst("section[class=grid-section grid-section-outside expert-comment]");
            if (commentDiv != null) {
                Element tip = commentDiv.selectFirst("div[class=tip]");
                if (tip != null) {
                    Element count = tip.selectFirst("strong");
                    if (count != null) {
                        doctor.setComment(count.text());
                    }
                }
            }

            // TODO: consultation/answer counts are currently disabled. To restore them, look up
            // "a[class=tip]" inside
            // "section[class=grid-section grid-section-outside expert-history-ask J_ExpertHistoryAsk]"
            // and pass its href to getSomNumb(href, doctor).

            // Service availability flags; all default to "0" once the service container exists
            Element service = doc.selectFirst("div[id=service]");
            if (service != null) {
                doctor.setIsGhuahao("0");
                doctor.setIsTuwen("0");
                doctor.setIsShihua("0");
                doctor.setIsFuwu("0");
                for (Element link : service.select("a")) {
                    String cls = link.attr("class");
                    if (cls.contains("guahao")) {
                        doctor.setIsGhuahao(isActive(cls));
                    }
                    if (cls.contains("tuwen")) {
                        doctor.setIsTuwen(isActive(cls));
                    }
                    if (cls.contains("shipin")) {
                        doctor.setIsShihua(isActive(cls));
                    }
                    if (cls.contains("servicePkg")) {
                        // Previously this overwrote the tuwen flag and left isFuwu stuck at "0".
                        doctor.setIsFuwu(isActive(cls));
                    }
                }
            }
        } catch (IOException e) {
            // Best effort: report the failure and still return what was collected.
            e.printStackTrace();
        }
        System.out.println("---->完成");
        return doctor;
    }

需要源码的同学可以联系博主QQ(1477517404)。爬取结果:



  • 1
    点赞
  • 4
    收藏
    觉得还不错? 一键收藏
  • 1
    评论
评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值