爬虫-链家二手房

爬虫-链家二手房

import ssl, re,json
from urllib.request import urlopen
# Disable HTTPS certificate verification by replacing the default HTTPS
# context factory with the unverified one.
# NOTE(review): this assigns to private ``ssl`` attributes and applies to the
# whole process, not just this script's requests -- confirm that skipping
# certificate checks is acceptable here.
ssl._create_default_https_context = ssl._create_unverified_context
def getPage(url):
    """Fetch ``url`` and return the response body decoded as UTF-8 text.

    The response is opened in a ``with`` block so it is closed as soon as the
    body has been read, even if reading or decoding raises. The caller loops
    over many pages, so leaving responses open would accumulate connections.

    Args:
        url: Value passed straight to ``urlopen``.

    Returns:
        The complete response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Exception: Any error raised by ``urlopen`` or ``read`` propagates
            unchanged.
    """
    with urlopen(url) as response:
        return response.read().decode("utf-8")

def parsePage(d):
    """Yield one dict per listing matched in the HTML text ``d``.

    A single regular expression (compiled with ``re.S`` so ``.`` also matches
    newlines) is scanned over ``d``. Every match produces a dict whose keys
    are the pattern's named groups, in the order listed in ``field_names``.
    Groups that sit inside an optional ``(?: ... )?`` section are ``None``
    when that section did not take part in the match. Any ``</span>`` text
    captured inside ``totalPrice`` is removed.

    Args:
        d: HTML source of one listing page, as ``str``.

    Yields:
        dict mapping each field name to the captured ``str`` (or ``None``
        for an optional field that was absent).
    """
    listing_pattern = re.compile(
        r'<!-- 热推标签、埋点 -->.*?data-is_focus="(?:1)?" data-sl="">(?P<title>.*?)</a>'
        r'.*?data-el="region">(?P<xiaoqu>.*?)</a>'
        r'.*?</span>(?P<huxing>.*?)<span'
        r'.*?/</span>(?P<mianji>.*?)<span'
        r'.*?/</span>(?P<chaoxiang>.*?)<span'
        r'.*?/</span>(?P<zhuangxiu>.*?)<'
        # "( ... )?" makes the section optional (0 or 1 occurrence); "(?: ... )"
        # keeps the wrapper non-capturing so only the named group is captured.
        r'(?:span class="divide">/</span>(?P<dianti>.*?)<)?'
        r'.*?div class="positionInfo">(?P<flood>.*?)<span'
        r'.*?/</span>(?P<floodtime>.*?)<span'
        r'.*?target="_blank">(?P<diqu>.*?)</a>'
        r'.*?class="followInfo">(?P<followInfo>.*?)<span'
        r'.*?/</span>(?P<daikancishu>.*?)<div class="tag">'
        # The next three tag spans are each optional.
        r'(?:<span class="subway">(?P<subway>.*?)</span>)?'
        r'(?:<span class=".*?">(?P<fangben>.*?)</span>)?'
        r'(?:<span class="haskey">(?P<haskey>.*?)</span>)?'
        r'.*?<div class="totalPrice"><span>(?P<totalPrice>.*?)</div>'
        r'.*?data-price=".*?"><span>(?P<unitPrice>.*?)</span>',
        re.S,
    )

    # Output keys, in the order they appear in each yielded dict.
    field_names = (
        "title",
        "xiaoqu",
        "huxing",
        "mianji",
        "chaoxiang",
        "zhuangxiu",
        "dianti",
        "flood",
        "floodtime",
        "diqu",
        "followInfo",
        "daikancishu",
        "subway",
        "fangben",
        "haskey",
        "totalPrice",
        "unitPrice",
    )

    for match in listing_pattern.finditer(d):
        record = {name: match.group(name) for name in field_names}
        # The totalPrice capture runs up to "</div>", so it can still contain
        # the closing tag of the inner span; drop it.
        record["totalPrice"] = record["totalPrice"].replace("</span>", "")
        yield record

# Crawl 100 listing pages and append every parsed listing to the output file
# as one JSON object per line.
# The file is managed by ``with`` so buffered lines are flushed and the handle
# is closed even when a fetch or parse step raises part-way through the crawl.
with open("lianjia_Second-hand house_info", mode="a", encoding="utf-8") as f:
    for i in range(100):
        # The first page is the bare listing URL; later pages add "pgN"
        # with N starting at 2.
        if i == 0:
            url = "https://bj.lianjia.com/ershoufang/"
        else:
            url = "https://bj.lianjia.com/ershoufang/" + "pg%s" % (i + 1)
        print(url)
        ret = parsePage(getPage(url))
        for obj in ret:
            # ensure_ascii=False keeps non-ASCII text readable in the output.
            data = json.dumps(obj, ensure_ascii=False)
            print(data)
            f.write(data + "\n")

 

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值