爬链家数据
#-*- coding:utf-8-*-
import urllib
import urllib.request
import re
from bs4 import BeautifulSoup
from itertools import chain
import xlwt
import re
import logging
import string
def dataid(pg):
    """Download listing-index page ``pg`` and return the listing ids found on it.

    Args:
        pg: Page number; it is interpolated into the index URL as ``pg<pg>``.

    Returns:
        list[str]: Every value captured by the ``data-id="..."`` pattern in the
        markup of the page's ``<li>`` elements, in the order they occur in
        that markup.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://wh.lianjia.com/ershoufang/guanggu/pg' + str(pg) + '/l1'
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection as soon as the body is read.
    with urllib.request.urlopen(request, timeout=30) as response:
        html = response.read().decode('utf-8')
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')
    # Iterate the elements directly: the previous index loop ran one step past
    # the end of the result list and raised IndexError on every call.
    li_markup = ''.join(str(li) for li in soup.find_all('li'))
    con = 'data-id="(.*?)" '
    return re.findall(con, li_markup, re.S)
def xiangxiyemian(id):
    """Scrape one listing detail page into a flat list of field strings.

    Args:
        id: Listing id, interpolated into the detail-page URL. The name
            shadows the ``id`` builtin but is kept so existing callers
            continue to work.

    Returns:
        list[str]: In order: the sale-price matches, the area matches, the
        ``<dd class="short">`` matches (unit price, down payment, monthly
        payment), entries 1-3 of the plain ``<dd>`` matches (layout,
        orientation, floor), and finally the community name and year taken
        from entry 4. When entry 4 is missing or does not match, two empty
        strings are used for community and year so the row keeps its width.

    Raises:
        urllib.error.URLError: If the page cannot be fetched.
    """
    url = 'http://wh.lianjia.com/ershoufang/' + str(id) + '.html'
    user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
    headers = {'User-Agent': user_agent}
    request = urllib.request.Request(url, headers=headers)
    # Use the same 30 s limit as dataid(); without a timeout a stalled server
    # would block the whole scraping run indefinitely.
    with urllib.request.urlopen(request, timeout=30) as response:
        html = response.read().decode('utf-8')
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    xiansoup = BeautifulSoup(html, 'html.parser')
    blocks = xiansoup.find_all('div', {'class': 'desc-text clear'})
    xiandata = ''.join(str(block) for block in blocks)

    # Regular expressions for the individual fields.
    sj = '<strong class="ft-num">(.*?)</strong>'  # sale price
    mj = '<i>/ (.*?)</i>'  # floor area
    dj = '<dd class="short">(.*?)</dd>'  # unit price, down payment, monthly payment
    # Plain <dd> entries: [1] layout, [2] orientation, [3] floor, [4] community
    hx = '<dd>(.*?)</dd>'
    # Splits entry 4 into the text before the region span and the text
    # between that span and the character for "year".
    conn = '(.*?)<span class="region">.*?</span>(.*?)年'

    shoujia = re.findall(sj, xiandata)
    mianji = re.findall(mj, xiandata)
    danjia = re.findall(dj, xiandata)
    huxing = re.findall(hx, xiandata)

    # Entry 4 may be absent or shaped differently on some pages; fall back to
    # blanks instead of aborting the whole export with an IndexError.
    community_year = ('', '')
    hu = re.findall(conn, huxing[4]) if len(huxing) > 4 else []
    if hu:
        community_year = hu[0][0:2]
    else:
        logging.warning('listing %s: community/year field not found', id)

    return merge(shoujia, mianji, danjia, huxing[1:4], community_year)
# Concatenate lists
def merge(*lsts):
    """Return a new list holding the items of every argument, in order.

    Each positional argument must be iterable; its items are appended to the
    result in the order the arguments were given.
    """
    return [item for sublst in lsts for item in sublst]
#for n in m:
# print (n)
# Excel output
def createExcel(path="e:/pythontest/t1.xls", pages=3):
    """Scrape the first ``pages`` index pages and save the result as an .xls file.

    Args:
        path: Destination passed to ``Workbook.save``. Defaults to the
            location the script has always written to.
        pages: Number of index pages to export, starting at page 1.
            Defaults to 3.
    """
    wbk = xlwt.Workbook()
    sheet = wbk.add_sheet('sheet1')
    createXLSTitle(sheet)
    # Pages are numbered from 1: createXLS places page p at a row offset of
    # (p - 1) * <length of page p - 1>, so page 0 would land on negative rows.
    for page in range(1, pages + 1):
        createXLS(sheet, page)
    wbk.save(path)
def createXLS(sheet, int):
    """Write the listings of index page ``int`` into ``sheet``.

    The k-th listing of the page (counting from 1) is written to row
    ``k + (int - 1) * t``, where ``t`` is the number of ids on page
    ``int - 1``; row 0 is left for the title row. Each scraped row is also
    printed.

    Args:
        sheet: Worksheet-like object exposing ``write(row, col, value)``.
        int: 1-based page number. The name shadows the ``int`` builtin but is
            kept so existing callers continue to work.

    Raises:
        ValueError: If ``int`` is smaller than 1, since that would address
            negative rows.
    """
    if int < 1:
        raise ValueError('page number must be >= 1, got %r' % (int,))

    # Fetch the id list once; every dataid() call is an HTTP request and the
    # result does not change between loop iterations.
    ids = dataid(int)
    if not ids:
        return

    # Rows taken by earlier pages. Page 1 has no predecessor, so the extra
    # request for the previous page is skipped there.
    offset = (int - 1) * len(dataid(int - 1)) if int > 1 else 0

    # Row numbers start at 1 but list indexes start at 0, so the first
    # listing (index 0) is written as row offset + 1 instead of being skipped.
    for j, listing_id in enumerate(ids, start=1):
        m = xiangxiyemian(listing_id)
        print(m)
        for s, value in enumerate(m):
            sheet.write(j + offset, s, value)
def createXLSTitle(sheet):
    """Fill row 0 of ``sheet`` with the ten column headings.

    Args:
        sheet: Worksheet-like object exposing ``write(row, col, value)``.
    """
    # Headings in column order: sale price, area, unit price, down payment,
    # monthly payment, layout, orientation, floor, community name, year.
    titles = (
        "售价",
        "面积",
        "单价",
        "首付",
        "月供",
        "户型",
        "朝向",
        "楼层",
        "小区名",
        "年份",
    )
    for column, title in enumerate(titles):
        sheet.write(0, column, title)
# Script entry point: build the spreadsheet only when run directly,
# not when this module is imported.
if __name__ == "__main__":
    createExcel()
Code written for Python 3; further improvements to follow.