爬虫小项目

#使用面向过程的方式爬取链家网西安地区的楼盘新房信息

#获取源码,将源码保存在文件中,在文件中进行匹配  在网页中操作影响数据读取速度,所以先保存在文件中,在文件中进行匹配操作,节约时间
# Step 1 (disabled): fetch page 1 of the Xi'an new-development listing and
# save the raw HTML to lianjia.html, so the XPath matching that follows can
# run against a local copy instead of the live site.
# The code is wrapped in a triple-quoted string and therefore never runs;
# remove the surrounding quotes to refresh the saved page.
'''import requests
from bs4 import BeautifulSoup
url = 'https://xa.fang.lianjia.com/loupan/pg1/'
headers = {
'User-Agent':'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
}
res = requests.get(url=url,headers=headers)
print(res.text)
file = open('lianjia.html','w',encoding='utf8')
file.write(res.text)
file.close()'''
#进行匹配操作
from lxml import html
# Step 2: parse the saved listing page (lianjia.html) and append one text
# record per development to loupan.txt.

# Read the snapshot through a context manager so the handle is always closed.
with open('lianjia.html', 'r', encoding='utf8') as file:
    files = file.read()
print(files)

etree = html.etree
cont = etree.HTML(files)

# XPath of every field, relative to ONE listing card
# (div.resblock-desc-wrapper), in the order the record template expects.
field_paths = (
    "div[@class='resblock-name']/a/text()",                               # development name
    "div[@class='resblock-name']/span[1]/text()",                         # property type
    "div[@class='resblock-name']/span[2]/text()",                         # sale status
    "div[@class='resblock-location']/span[1]/text()",                     # address, part 1
    "div[@class='resblock-location']/span[2]/text()",                     # address, part 2
    "div[@class='resblock-location']/a/text()",                           # address, part 3
    "a[@class='resblock-room']/span[1]/text()",                           # layout
    "div[@class='resblock-area']/span/text()",                            # floor area
    "div[@class='resblock-price']/div[@class='main-price']/span[1]/text()",  # unit price
    "div[@class='resblock-price']/div[@class='second']/text()",           # total price
)

# Extract card by card. Querying each field over the whole page and zipping
# the resulting lists shifts values onto the wrong development as soon as a
# single card lacks a field; here a missing field becomes '' and every
# column keeps exactly one entry per card.
columns = [[] for _ in field_paths]
for card in cont.xpath("//div[@class='resblock-desc-wrapper']"):
    for column, path in zip(columns, field_paths):
        found = card.xpath(path)
        column.append(found[0] if found else '')

# Diagnostics: dump every column together with its length.
for column in columns:
    print(column)
    print(len(column))

# The newline between the floor-area and price fields was missing, which
# glued the two values together on one output line.
record_template = (
    '楼盘名:%s\n房屋类型:%s\n是否在售:%s\n地址:%s %s %s\n'
    '户型:%s\n建筑面积:%s\n价格:%s/平米\n总价:%s\n\n'
)

# Open the output once instead of reopening it for every record.
with open('loupan.txt', 'a', encoding='utf8') as f:
    for row in zip(*columns):
        conts = record_template % row
        print(conts)
        f.write(conts)

#完整代码

import requests
import time
from lxml import html
# Full crawler: walk listing pages 1-100 and append one text record per
# development to loupan_totalsecond.txt (the total price is not collected).
headers = {
    'User-Agent': 'Opera/9.80 (Windows NT 6.1; U; en) Presto/2.8.131 Version/11.11'
}

etree = html.etree

# XPath of every field, relative to ONE listing card
# (div.resblock-desc-wrapper), in the order the record template expects.
field_paths = (
    "div[@class='resblock-name']/a/text()",                               # development name
    "div[@class='resblock-name']/span[1]/text()",                         # property type
    "div[@class='resblock-name']/span[2]/text()",                         # sale status
    "div[@class='resblock-location']/span[1]/text()",                     # address, part 1
    "div[@class='resblock-location']/span[2]/text()",                     # address, part 2
    "div[@class='resblock-location']/a/text()",                           # address, part 3
    "a[@class='resblock-room']/span[1]/text()",                           # layout
    "div[@class='resblock-area']/span/text()",                            # floor area
    "div[@class='resblock-price']/div[@class='main-price']/span[1]/text()",  # unit price
)

# The newline between the floor-area and price fields was missing, which
# glued the two values together on one output line.
record_template = (
    '楼盘名:%s\n房屋类型:%s\n是否在售:%s\n地址:%s %s %s\n'
    '户型:%s\n建筑面积:%s\n价格:%s/平米\n'
)

# Pause once per HTTP request; the previous 0.1 s sleep sat in the record
# loop and so did not space the requests themselves.
# NOTE(review): 1 s assumes roughly ten records per page, which would keep
# the old overall pace -- tune as needed.
request_delay = 1.0

for i in range(1, 101):
    url = 'https://xa.fang.lianjia.com/loupan/pg%d/' % i
    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        res = requests.get(url=url, headers=headers, timeout=10)
        res.raise_for_status()
    except requests.RequestException as err:
        # Skip the failed page instead of aborting the remaining ones.
        print('第%d页爬取失败: %s' % (i, err))
        time.sleep(request_delay)
        continue

    cont = etree.HTML(res.text)

    # Extract card by card so a field missing from one card becomes ''
    # instead of shifting later values onto the wrong development (which is
    # what zipping independent page-wide result lists does).
    records = []
    for card in cont.xpath("//div[@class='resblock-desc-wrapper']"):
        values = []
        for path in field_paths:
            found = card.xpath(path)
            values.append(found[0] if found else '')
        records.append(record_template % tuple(values))

    # One open/close per page rather than per record.
    with open('loupan_totalsecond.txt', 'a', encoding='utf8') as f:
        f.writelines(records)

    # Report the page number; the old counter was reset on every page and
    # incremented per record, so it printed record indexes instead.
    print('爬取第%d页' % i)
    time.sleep(request_delay)

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值