#使用面向过程的方式爬取链家网西安地区的楼盘新房信息
# 获取源码,将源码保存在文件中,在文件中进行匹配
# 在网页中操作影响数据读取速度,所以先保存在文件中,在文件中进行匹配操作,节约时间
# '''
# import requests
# from bs4 import BeautifulSoup
#
# url = 'https://xa.fang.lianjia.com/loupan/pg1/'
# headers = {
#     'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
# }
# res = requests.get(url=url,headers=headers)
# print(res.text)
# file = open('lianjia.html','w',encoding='utf8')
# file.write(res.text)
# file.close()
# '''
# Matching step: parse the page saved as lianjia.html and append one formatted
# record per listing to loupan.txt.
from lxml import html

# Read the saved page inside a context manager so the handle is closed again.
with open('lianjia.html', 'r', encoding='utf8') as file:
    files = file.read()
print(files)

etree = html.etree
cont = etree.HTML(files)

# Every field sits under the same listing wrapper; only the tail of the path
# differs from one field to the next.
base = "//div[@class='resblock-desc-wrapper']"
s1 = cont.xpath(base + "/div[@class='resblock-name']/a/text()")           # listing name
s2 = cont.xpath(base + "/div[@class='resblock-name']/span[1]/text()")     # property type
s3 = cont.xpath(base + "/div[@class='resblock-name']/span[2]/text()")     # sale status
s4 = cont.xpath(base + "/div[@class='resblock-location']/span[1]/text()")  # address, part 1
s5 = cont.xpath(base + "/div[@class='resblock-location']/span[2]/text()")  # address, part 2
s6 = cont.xpath(base + "/div[@class='resblock-location']/a/text()")        # address, part 3
s7 = cont.xpath(base + "/a[@class='resblock-room']/span[1]/text()")        # layout
s8 = cont.xpath(base + "/div[@class='resblock-area']/span/text()")         # floor area
s9 = cont.xpath(base + "/div[@class='resblock-price']/div[@class='main-price']/span[1]/text()")  # price per square metre
s10 = cont.xpath(base + "/div[@class='resblock-price']/div[@class='second']/text()")             # total price

columns = (s1, s2, s3, s4, s5, s6, s7, s8, s9, s10)

# Debug output: each column followed by its length. The columns are collected
# independently and zip() below stops at the shortest one, so a listing that
# lacks any single field shifts the later values; differing lengths printed
# here make such a mismatch visible.
for column in columns:
    print(column)
    print(len(column))

# Open the output once for the whole run instead of once per record.
# NOTE(review): the template has no newline between the floor area and the
# price; it is kept exactly as written so the output format does not change.
with open('loupan.txt', 'a', encoding='utf8') as f:
    for i1, i2, i3, i4, i5, i6, i7, i8, i9, i10 in zip(*columns):
        conts = '楼盘名:%s\n房屋类型:%s\n是否在售:%s\n地址:%s %s %s\n户型:%s\n建筑面积:%s价格:%s/平米\n总价:%s\n\n' % (
            i1, i2, i3, i4, i5, i6, i7, i8, i9, i10)
        print(conts)
        f.write(conts)
#完整代码
# Complete crawler: walk result pages 1-100 of the Lianjia Xi'an new-home
# listings and append one formatted record per listing to
# loupan_totalsecond.txt.
import time

import requests
from lxml import html

headers = {
    'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
}

PAGE_URL = 'https://xa.fang.lianjia.com/loupan/pg%d/'
OUTPUT_PATH = 'loupan_totalsecond.txt'
FIRST_PAGE = 1
LAST_PAGE = 100
REQUEST_TIMEOUT = 10   # seconds; without a timeout a stalled connection blocks forever
PAGE_DELAY = 0.1       # seconds to pause between pages

# One match per listing card; every field below is looked up relative to it so
# that a card lacking a field cannot shift the values of the cards after it.
LISTING_XPATH = "//div[@class='resblock-desc-wrapper']"

# Order matches the placeholders in RECORD_TEMPLATE.
FIELD_XPATHS = (
    "./div[@class='resblock-name']/a/text()",            # listing name
    "./div[@class='resblock-name']/span[1]/text()",      # property type
    "./div[@class='resblock-name']/span[2]/text()",      # sale status
    "./div[@class='resblock-location']/span[1]/text()",  # address, part 1
    "./div[@class='resblock-location']/span[2]/text()",  # address, part 2
    "./div[@class='resblock-location']/a/text()",        # address, part 3
    "./a[@class='resblock-room']/span[1]/text()",        # layout
    "./div[@class='resblock-area']/span/text()",         # floor area
    "./div[@class='resblock-price']/div[@class='main-price']/span[1]/text()",  # price per square metre
)
# The total price (div[@class='second'] under resblock-price) is not extracted
# here; the original had that lookup commented out.

# NOTE(review): there is no newline between the floor area and the price, and
# no blank line between records; kept exactly as written so the output format
# of the file does not change.
RECORD_TEMPLATE = '楼盘名:%s\n房屋类型:%s\n是否在售:%s\n地址:%s %s %s\n户型:%s\n建筑面积:%s价格:%s/平米\n'


def _first_text(node, path):
    """Return the first text node matched by *path* under *node*, or ''."""
    matches = node.xpath(path)
    return matches[0] if matches else ''


def parse_listings(page_source):
    """Return one formatted record string per listing card in *page_source*."""
    tree = html.etree.HTML(page_source)
    if tree is None:
        # NOTE(review): guards against the parser handing back no document for
        # an empty/unparseable body - confirm against the lxml version in use.
        return []
    return [
        RECORD_TEMPLATE % tuple(_first_text(card, path) for path in FIELD_XPATHS)
        for card in tree.xpath(LISTING_XPATH)
    ]


def main():
    """Crawl every page in turn, appending its records to OUTPUT_PATH.

    A page whose request fails is reported and skipped so that one bad
    response does not abort the rest of the crawl.
    """
    for page in range(FIRST_PAGE, LAST_PAGE + 1):
        try:
            res = requests.get(url=PAGE_URL % page, headers=headers,
                               timeout=REQUEST_TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as exc:
            print('第%d页请求失败:%s' % (page, exc))
        else:
            records = parse_listings(res.text)
            if records:
                # Open the file once per page rather than once per record.
                with open(OUTPUT_PATH, 'a', encoding='utf8') as f:
                    f.writelines(records)
            # Report the real page number; the original counter was reset to 0
            # on every iteration.
            print('爬取第%d页' % page)
        time.sleep(PAGE_DELAY)


if __name__ == '__main__':
    main()