# 链家二手房在售爬虫 (Lianjia second-hand homes for sale scraper)

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 21:13:13 2019

@author: 盗号
"""

# -*- coding: utf-8 -*-
"""
Created on Mon Mar 25 10:10:22 2019

@author: zhang-peng
"""
# -*- coding: utf-8 -*-
import bs4 
import requests
import time#引入time,计算下载时间


def open_url(url):
    """Issue a GET request for *url* and return the ``requests`` response.

    The request carries a desktop Chrome ``User-Agent`` header and a
    10-second timeout. The response is returned as-is; this function does
    not inspect the HTTP status code.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.109 Safari/537.36',
    }
    return requests.get(url, headers=headers, timeout=10)

# URL prefix of the listing index; the crawl loop appends a page number to it.
host = 'https://ty.lianjia.com/ershoufang/pg'

# Crawl parameters and counters.
q = 100      # number of index pages the crawl loop requests
count = 0    # index-page counter used by the crawl loop
size = 0     # number of detail pages processed so far (progress display)

# Accumulators filled while scraping.
whvj = []    # detail-page links collected from the index pages
aa = []      # per listing: text of the first <h1 class="main">, split into words
bb = []      # per listing: text of the third <div class="content">, split into words
cc = []      # not filled by any active code in this script
dd = []      # per listing: text of the first <span class="info">, split into words
ee = []      # text of the <div class="price"> elements found on detail pages

# Wall-clock start time, used for the elapsed-time report at the end.
start = time.time()


# Crawl the listing index and collect the detail-page link of every listing.
#
# Page numbers are 1-based, so the loop requests "pg1" .. "pg<q>". Building
# the URL from a counter that starts at 0 would request "pg0" (presumably not
# a distinct page -- TODO confirm) and would never reach the last page.
seen_links = set(whvj)  # links already collected; used to skip duplicates
for count in range(1, q + 1):
    url = host + str(count)
    soup = bs4.BeautifulSoup(open_url(url).text, 'html.parser')

    # Every <a class="img"> on an index page is taken as a listing link.
    for anchor in soup.find_all('a', class_="img"):
        href = anchor.get('href')  # None when the anchor has no href
        # De-duplicate while collecting so first-seen order is preserved
        # and repeated runs produce rows in a reproducible order.
        if href and href not in seen_links:
            seen_links.add(href)
            whvj.append(href)

    # Progress bar: one block character per completed percent.
    done = count / q
    print('\r' + "已经下载:" + int(done * 100) * "█" + "【" + str(round(float(done) * 100, 2)) + "%" + "】", end="")
# Visit every listing's detail page and record exactly ONE entry per listing
# in each of aa / bb / dd / ee, so that index i in those lists always
# describes whvj[i] when the rows are assembled afterwards.
content_size = len(whvj)
for size, link in enumerate(whvj, 1):
    detail = bs4.BeautifulSoup(open_url(link).text, 'html.parser')

    # find_all() returns a short or empty list when the page lacks the
    # expected markup. Fall back to an empty value in that case instead of
    # raising IndexError and losing everything scraped so far.
    headings = detail.find_all('h1', class_='main')
    aa.append(headings[0].text.split() if headings else [])

    sections = detail.find_all("div", class_="content")
    bb.append(sections[2].text.split() if len(sections) > 2 else [])

    infos = detail.find_all("span", class_="info")
    dd.append(infos[0].text.split() if infos else [])

    # Only the first <div class="price"> is kept: appending every match
    # would give ee more entries than there are listings and shift the
    # price column of all following rows.
    prices = detail.find_all('div', class_='price')
    ee.append(prices[0].text if prices else '')

    # Progress bar: block characters, "done/total" count, and percentage.
    print('\r' + "已经下载:" + int(size / content_size * 100) * "█" + " 【" + str(size) + "/" + str(content_size) + "】" + "【" + str(round(float(size / content_size) * 100, 2)) + "%" + "】", end="")

# Build one '^'-delimited text line per collected link. The fields are, in
# order: dd entry, aa entry, link, ee entry, bb entry; each line ends with a
# trailing '^' followed by a newline.
result = [
    '^'.join(map(str, (dd[idx], aa[idx], link, ee[idx], bb[idx]))) + '^\n'
    for idx, link in enumerate(whvj)
]

# Report the elapsed wall-clock time since `start` was taken.
end = time.time()
print("总耗时:" + str(end - start) + "秒")

# Write all lines to ty.txt (UTF-8), replacing any existing file.
with open('ty.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(result)
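# NOTE: The text that followed the script here (a "comments" heading and a
# red-packet / wallet-payment widget: greeting prompt, minimum count of 10,
# minimum amount of 5 yuan, balance and top-up notes) was web-page residue
# copied along with the code. It is not part of the program and is kept only
# as this comment so the file remains valid Python.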