爬链家数据(武汉光谷房价)

爬链家数据

#-*- coding:utf-8-*-


import urllib
import urllib.request 
import re
from bs4 import BeautifulSoup
from itertools import chain
import xlwt
import re
import logging
import string


def dataid(pg):
    """Fetch one Lianjia Guanggu listing page and return the listing IDs on it.

    Args:
        pg: Page number; it is inserted into the listing URL as ``pg<pg>``.

    Returns:
        A list of strings, one per ``data-id="..." `` attribute found inside
        the page's ``<li>`` elements, in document order. Empty if none match.

    Network and decoding errors raised by ``urllib.request.urlopen`` /
    ``bytes.decode`` propagate to the caller.
    """
    url = 'http://wh.lianjia.com/ershoufang/guanggu/pg'+str(pg)+'/l1'
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Close the connection deterministically once the body has been read.
    with urllib.request.urlopen(request, timeout=30) as response:
        html = response.read().decode('utf-8')
    # Name the parser explicitly so results do not depend on which optional
    # parser libraries happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    # The previous index loop ran to len(sdata) inclusive and always raised
    # IndexError; joining the stringified tags directly avoids indexing.
    items = ''.join(str(li) for li in soup.find_all('li'))
    con = 'data-id="(.*?)" '
    return re.findall(con, items, re.S)


def xiangxiyemian(id):
    """Fetch a listing's detail page and return its fields as one flat list.

    Args:
        id: Listing identifier used to build the detail-page URL. (The name
            shadows the builtin but is kept for interface compatibility.)

    Returns:
        A list built by concatenating, in this order: the price matches, the
        area matches, the ``<dd class="short">`` matches (unit price, down
        payment, monthly payment per the original author's notes), the
        ``<dd>`` matches at indices 1-3 (layout, orientation, floor per the
        original author's notes), and finally the community name and build
        year parsed from the ``<dd>`` match at index 4. If that last entry is
        missing or does not match, two empty strings are used instead so one
        odd listing does not abort the whole scrape.

    Network and decoding errors propagate to the caller.
    """
    url = 'http://wh.lianjia.com/ershoufang/'+str(id)+'.html'
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Same 30 s timeout as dataid(); without it a stalled server blocks forever.
    with urllib.request.urlopen(request, timeout=30) as response:
        html = response.read().decode('utf-8')
    xiansoup = BeautifulSoup(html, 'html.parser')
    blocks = xiansoup.find_all('div', {'class': 'desc-text clear'})
    xiandata = ''.join(str(block) for block in blocks)

    # Regular-expression patterns for the individual fields.
    sj = '<strong class="ft-num">(.*?)</strong>'  # sale price
    mj = '<i>/ (.*?)</i>'                         # area
    dj = '<dd class="short">(.*?)</dd>'           # unit price, down payment, monthly payment
    hx = '<dd>(.*?)</dd>'                         # [1] layout, [2] orientation, [3] floor, [4] community
    conn = '(.*?)<span class="region">.*?</span>(.*?)年'

    shoujia = re.findall(sj, xiandata)
    mianji = re.findall(mj, xiandata)
    danjia = re.findall(dj, xiandata)
    huxing = re.findall(hx, xiandata)

    # Community name and year live in the fifth <dd>; fall back to blanks
    # when the page lacks it or its markup differs.
    community_and_year = ('', '')
    if len(huxing) > 4:
        found = re.findall(conn, huxing[4])
        if found:
            community_and_year = found[0][0:2]

    # merge() is variadic, so a single call concatenates every part in order.
    return merge(shoujia, mianji, danjia, huxing[1:4], community_and_year)


# Merge lists
def merge(*lsts):
    """Concatenate any number of iterables into a single new list.

    Items keep their original order: everything from the first argument,
    then everything from the second, and so on. With no arguments an empty
    list is returned.
    """
    return [item for group in lsts for item in group]
    
    #for n in m:
    #   print (n)






# Excel output
def createExcel(pages=3, path="e:/pythontest/t1.xls"):
    """Scrape listing pages and save the results to an .xls workbook.

    Args:
        pages: Number of listing pages to scrape, starting at page 1.
        path: Destination file passed to ``Workbook.save``.

    A sheet named ``sheet1`` is created, the header row is written, and
    ``createXLS`` is called once per page before the workbook is saved.
    """
    wbk = xlwt.Workbook()
    sheet = wbk.add_sheet('sheet1')
    createXLSTitle(sheet)
    # Pages are 1-based: createXLS places rows at j + (page - 1) * t, which
    # would be negative for page 0.
    for page in range(1, pages + 1):
        createXLS(sheet, page)
    wbk.save(path)
    
def createXLS(sheet,int):
    """Write the detail rows for one listing page into *sheet*.

    Args:
        sheet: Worksheet-like object exposing ``write(row, col, value)``.
        int: 1-based listing page number. (The name shadows the builtin but
            is kept for interface compatibility.)

    Each listing's fields, as returned by ``xiangxiyemian``, are printed and
    written left to right on row ``j + (int - 1) * t``, where ``j`` is the
    listing's index on the page and ``t`` is the number of IDs on the
    previous page.
    """
    # Fetch the ID list once; the previous code re-downloaded the page for
    # the loop bound and again on every iteration.
    ids = dataid(int)
    # Page 1 has no preceding page, so its row offset is 0 and the extra
    # request for the previous page is skipped.
    row_offset = (int - 1) * len(dataid(int - 1)) if int != 1 else 0
    # NOTE(review): iteration starts at index 1, as in the original, so the
    # first ID on each page is never written - confirm whether intentional.
    for j in range(1, len(ids)):
        m = xiangxiyemian(ids[j])
        print (m)
        for s, value in enumerate(m):
            sheet.write(j + row_offset, s, value)
def createXLSTitle(sheet):
    """Write the ten column headings into row 0 of *sheet*.

    Headings are written left to right starting at column 0 via
    ``sheet.write(row, col, value)``.
    """
    headings = (
        "售价",
        "面积",
        "单价",
        "首付",
        "月供",
        "户型",
        "朝向",
        "楼层",
        "小区名",
        "年份",
    )
    for column, heading in enumerate(headings):
        sheet.write(0, column, heading)
    
    
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    createExcel()


    






        


        
代码基于 Python 3 编写,后续会继续改进。

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值