获取房源详细信息时,先分析结构(天府新区)
①:
起始页面:
https://cd.lianjia.com/ershoufang/tianfuxinqu/
等同于
https://cd.lianjia.com/ershoufang/tianfuxinqu/pg1
第二页:
https://cd.lianjia.com/ershoufang/tianfuxinqu/pg2/
所有页面的链接均为在起始链接尾部加上 pg + 页码(例如 pg1、pg2)
生成指定区间页面
# Fill page_queue with the listing-page URLs for the requested page range.
base_url = 'https://cd.lianjia.com/ershoufang/tianfuxinqu/pg'
first_page = int(input('\n请输入起始页码:'))
last_page = int(input('\n请输入结束页码:'))
page_no = first_page
while page_no <= last_page:  # inclusive of the last page
    page_queue.put(f"{base_url}{page_no}/")
    page_no += 1
②:
根据所得页面解析房源具体链接
用BeautifulSoup获取
#导入模块
from bs4 import BeautifulSoup as BS
def parse_page(self, url):
    """Fetch one listing page and queue every detail-page link found on it.

    Args:
        url: URL of a listing page (e.g. ``.../tianfuxinqu/pg2/``).

    Each ``<li>`` directly under ``.sellListContent`` contributes the ``href``
    of its first anchor to ``self.url_queue``.  Errors are printed rather than
    raised so a single bad page does not kill the worker.
    """
    try:
        response = requests.get(
            url,
            headers={'User-Agent': random.choice(self.USER_AGENT)},
            timeout=10,  # without a timeout a stalled connection blocks forever
        )
        soup = BS(response.text, 'html.parser')
        for entry in soup.select('.sellListContent > li'):
            anchor = entry.a
            # Skip entries without a link instead of aborting the whole page.
            if anchor is None or not anchor.get('href'):
                continue
            self.url_queue.put(anchor['href'])
            print(anchor['href'])
    except Exception as a:
        print(a)
③:
根据房源具体链接获得数据:
# Parse one listing detail page and extract the fields of interest.
new_url = self.url_queue.get()
response = requests.get(
    new_url,
    headers={'User-Agent': random.choice(self.USER_AGENT)},
    timeout=10,  # without a timeout a stalled connection blocks forever
)
xx_soup = BS(response.text, 'html.parser')

lp_name = xx_soup.select('.communityName > a')[0].text  # community name
lp_address = '-'.join(xx_soup.select('.areaName > span')[1].text.split())  # location
lp_jiedai = xx_soup.select('.brokerName > a')[0].text  # agent
lp_jdph = xx_soup.select('.phone')[1].text[:-6]  # agent phone (last 6 characters dropped)

# Both fields live in the "transaction" list of the introduction section.
transaction_items = '#introduction > div > div > .transaction > div > ul > li'
lp_gptim = xx_soup.select(transaction_items + ' > span')[1].text  # listing date
# Mortgage status: strip newlines/spaces, then drop the first 4 characters
# (presumably the field label -- verify against the live page).
lp_dyqq = re.sub(r'[\n ]', '', xx_soup.select(transaction_items)[6].text)[4:]

lp_sprice = xx_soup.select('.price > span')[0].text + '万元'  # total price
lp_price = xx_soup.select('.unitPrice > span')[0].text  # unit price
lp_tslc = xx_soup.select('.room > div')  # rooms and floor; elements [0] and [1] are used
lp_chaoxzx = xx_soup.select('.houseInfo > .type > div')  # orientation and decoration; [0] and [1]
lp_sjieg = xx_soup.select('.houseInfo > .area > div')  # area and structure; [0] and [1]

# Elevator information is taken from the 11th entry of the intro list.
lp_dt = xx_soup.select('.introContent > div > .content > ul > li')[10].text
# Any run of the listed characters (or newlines) is replaced by a fixed
# placeholder -- presumably for pages where that entry is not the elevator row.
lp_dt = re.sub(r'[交易权属商品房\n]+', '配备电梯暂无数据', lp_dt)

# Other listing tags: stringify the anchors and strip markup characters.
# Raw string with explicit escapes avoids the invalid "\]" escape warning.
lp_other = xx_soup.select('.content > a')
lp_other = '-'.join(re.sub(r'[a-z<>":/.0-9=_,\[\]]', '', str(lp_other)).split())
④:
保存并邮件发送文件
# Export the collected rows to esf.xls (tab-separated text, one row per line).
wj_name = 'esf.xls'
header = 'A楼盘挂牌日\tB楼盘信息\tC均价/总价\tD楼盘位置\tE顾问\tF接待电话\tG楼层修筑日期结构\tH楼盘其它信息\tI抵押情况\tj原地址\n'
# A single context manager guarantees the handle is closed even if a write
# fails; append mode is kept so existing content is never truncated.
with open(wj_name, 'a+', encoding='utf-8') as f:
    f.write(header)
    for msg in self.items:
        # Every cell is followed by a tab (including the last), then a newline.
        f.write(''.join(x_msg + '\t' for x_msg in msg))
        f.write('\n')
邮件发送时需自行在邮箱中申请授权码(此处以qq邮箱为例):
qq邮箱授权码申请:
qq邮箱-设置-账户 下开启IMAP/SMTP服务即可
保存好授权码不要泄漏
# Send the exported file as an e-mail attachment, then delete the local copy.
# NOTE(review): the account and authorization code are hard-coded; keep real
# values out of shared source.
mail_user = "13579111315@qq.com"
mail_pass = "jcxxmwvfxruzh***"  # authorization code; request your own in QQ Mail
receivers = ['735324273@qq.com']  # list of one or more recipient addresses

msg = MIMEMultipart()
msg["Subject"] = wj_name  # subject line is the attachment file name
msg["From"] = 'snowing'  # sender display name
msg["To"] = 'summer'  # recipient display name
msg.attach(MIMEText("哔哔"))  # plain-text body

# Read the attachment through a context manager so the handle is closed.
with open(wj_name, 'rb') as attachment_file:
    part = MIMEApplication(attachment_file.read())
part.add_header('Content-Disposition', 'attachment', filename=wj_name)
msg.attach(part)

# The context manager issues QUIT and closes the socket even if login or
# sending raises.  No port is given, so the default SMTP port (25) is used.
# NOTE(review): starttls() is never called, so the login is not encrypted by
# this code -- consider smtplib.SMTP_SSL if the server supports it.
with smtplib.SMTP("smtp.qq.com", timeout=22) as s:
    s.login(mail_user, mail_pass)
    s.sendmail(mail_user, receivers, msg.as_string())
    print('发送成功')
    os.remove(wj_name)  # remove the local file once it has been sent
完整代码如下:
import os
import queue
import random
import re
import smtplib
import threading
import time
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import requests
from bs4 import BeautifulSoup as BS
# import pandas as pd  # optional: only needed for the pandas export variant

# Seconds to wait for an HTTP response; without it a stalled connection would
# block a worker thread (and therefore join()) forever.
REQUEST_TIMEOUT = 10


class analysis_url(threading.Thread):
    """Worker thread that turns listing-page URLs into detail-page URLs.

    Args:
        page_queue: queue of listing-page URLs to consume.
        url_queue: queue that receives the detail-page links found.
        USER_AGENT: list of User-Agent strings; one is picked at random per request.
    """

    def __init__(self, page_queue, url_queue, USER_AGENT, *args, **kwargs):
        super(analysis_url, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.url_queue = url_queue
        self.USER_AGENT = USER_AGENT

    def run(self):
        """Consume page_queue until it is empty."""
        while True:
            # get_nowait() avoids the race where another worker takes the last
            # item between an empty() check and a blocking get().
            try:
                url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Fetch one listing page and queue every detail-page link on it.

        Errors are printed rather than raised so one bad page does not kill
        the worker.
        """
        try:
            response = requests.get(
                url,
                headers={'User-Agent': random.choice(self.USER_AGENT)},
                timeout=REQUEST_TIMEOUT,
            )
            soup = BS(response.text, 'html.parser')
            for entry in soup.select('.sellListContent > li'):
                anchor = entry.a
                # Skip entries without a link instead of aborting the page.
                if anchor is None or not anchor.get('href'):
                    continue
                self.url_queue.put(anchor['href'])
        except Exception as a:
            print(a)


class saved_message(threading.Thread):
    """Worker thread that scrapes detail pages into rows appended to ``items``.

    Args:
        page_queue: unused by this worker; kept for signature compatibility.
        url_queue: queue of detail-page URLs to consume.
        xxi_queue: unused by this worker; kept for signature compatibility.
        items: shared list that receives one row (list of str) per listing.
        USER_AGENT: list of User-Agent strings; one is picked at random per request.
    """

    def __init__(self, page_queue, url_queue, xxi_queue, items, USER_AGENT, *args, **kwargs):
        super(saved_message, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.url_queue = url_queue
        self.xxi_queue = xxi_queue
        self.items = items
        self.USER_AGENT = USER_AGENT

    def run(self):
        """Consume url_queue until it is empty, appending one row per page."""
        while True:
            # See analysis_url.run for why get_nowait() is used.
            try:
                new_url = self.url_queue.get_nowait()
            except queue.Empty:
                break
            try:
                item = self._parse_detail(new_url)
            except Exception as a:
                # Best effort: report the failure and move on to the next URL.
                print(a)
                continue
            print('完成解析网页并保存数据', new_url)
            self.items.append(item)
            # Optional pandas export variant (requires `import pandas as pd`):
            #   pd.DataFrame(self.items).to_csv(
            #       'BBJ.xls', index=False, sep=',', encoding='utf-8-sig')

    def _parse_detail(self, new_url):
        """Download one detail page and return its row in output-column order."""
        response = requests.get(
            new_url,
            headers={'User-Agent': random.choice(self.USER_AGENT)},
            timeout=REQUEST_TIMEOUT,
        )
        xx_soup = BS(response.text, 'html.parser')

        lp_name = xx_soup.select('.communityName > a')[0].text  # community name
        lp_address = '-'.join(xx_soup.select('.areaName > span')[1].text.split())  # location
        lp_jiedai = xx_soup.select('.brokerName > a')[0].text  # agent
        lp_jdph = xx_soup.select('.phone')[1].text[:-6]  # agent phone (last 6 characters dropped)

        transaction_items = '#introduction > div > div > .transaction > div > ul > li'
        lp_gptim = xx_soup.select(transaction_items + ' > span')[1].text  # listing date
        # Mortgage status: strip newlines/spaces, then drop the first 4
        # characters (presumably the field label -- verify against the page).
        lp_dyqq = re.sub(r'[\n ]', '', xx_soup.select(transaction_items)[6].text)[4:]

        lp_sprice = xx_soup.select('.price > span')[0].text + '万元'  # total price
        lp_price = xx_soup.select('.unitPrice > span')[0].text  # unit price
        lp_tslc = xx_soup.select('.room > div')  # rooms and floor
        lp_chaoxzx = xx_soup.select('.houseInfo > .type > div')  # orientation and decoration
        lp_sjieg = xx_soup.select('.houseInfo > .area > div')  # area and structure

        # Elevator information is taken from the 11th entry of the intro list;
        # any run of the listed characters is replaced by a fixed placeholder.
        lp_dt = xx_soup.select('.introContent > div > .content > ul > li')[10].text
        lp_dt = re.sub(r'[交易权属商品房\n]+', '配备电梯暂无数据', lp_dt)

        # Other listing tags: stringify the anchors and strip markup characters.
        lp_other = xx_soup.select('.content > a')
        lp_other = '-'.join(re.sub(r'[a-z<>":/.0-9=_,\[\]]', '', str(lp_other)).split())

        # Column order matches the header written by down_message.run.
        return [
            lp_gptim,
            '-'.join((
                lp_name,
                lp_tslc[0].text,
                lp_sjieg[0].text,
                lp_chaoxzx[0].text,
                lp_chaoxzx[1].text,
                lp_dt,
            )),
            lp_price + '-' + lp_sprice,
            lp_address,
            lp_jiedai,
            lp_jdph,
            lp_tslc[1].text + '/' + lp_sjieg[1].text,
            lp_other,
            lp_dyqq,
            new_url,
        ]


class down_message(threading.Thread):
    """Thread that writes the collected rows to ``esf.xls`` and e-mails the file.

    Args:
        items: list of rows (each a list of str) to export.
        s_time: start timestamp from ``time.time()``, used to report elapsed time.
    """

    def __init__(self, items, s_time, *args, **kwargs):
        super(down_message, self).__init__(*args, **kwargs)
        self.items = items
        self.s_time = s_time

    def run(self):
        """Export the rows, send the file as an attachment, then delete it."""
        wj_name = 'esf.xls'  # tab-separated text file
        header = 'A楼盘挂牌日\tB楼盘信息\tC均价/总价\tD楼盘位置\tE顾问\tF接待电话\tG楼层修筑日期结构\tH楼盘其它信息\tI抵押情况\tj原地址\n'
        # A CSV variant would use a comma-separated header, e.g. a file named
        # 'esf.csv' opened with encoding='gbk'.
        # One context manager guarantees the handle is closed even if a write
        # fails; append mode is kept so existing content is never truncated.
        with open(wj_name, 'a+', encoding='utf-8') as f:
            f.write(header)
            for msg in self.items:
                # Every cell is followed by a tab (including the last).
                f.write(''.join(x_msg + '\t' for x_msg in msg))
                f.write('\n')
        print('保存文件完毕 耗时', time.time() - self.s_time)
        print('开始发送邮件')
        self._send_mail(wj_name)

    def _send_mail(self, wj_name):
        """E-mail ``wj_name`` as an attachment and remove the local file."""
        # NOTE(review): the account and authorization code are hard-coded;
        # keep real values out of shared source.
        mail_user = "13579111315@qq.com"
        mail_pass = "jcxxmwvfxr******"  # authorization code; request your own in QQ Mail
        receivers = ['735324273@qq.com']  # list of one or more recipient addresses

        msg = MIMEMultipart()
        msg["Subject"] = wj_name  # subject line is the attachment file name
        msg["From"] = 'snowing'  # sender display name
        msg["To"] = 'summer'  # recipient display name
        msg.attach(MIMEText("哔哔"))  # plain-text body

        # Read the attachment through a context manager so the handle is closed.
        with open(wj_name, 'rb') as attachment_file:
            part = MIMEApplication(attachment_file.read())
        part.add_header('Content-Disposition', 'attachment', filename=wj_name)
        msg.attach(part)

        # The context manager issues QUIT and closes the socket even if login
        # or sending raises.  No port is given, so the default (25) is used.
        # NOTE(review): starttls() is never called, so the login is not
        # encrypted by this code -- consider smtplib.SMTP_SSL if supported.
        with smtplib.SMTP("smtp.qq.com", timeout=22) as s:
            s.login(mail_user, mail_pass)
            s.sendmail(mail_user, receivers, msg.as_string())
            print('发送成功')
            os.remove(wj_name)  # remove the local file once it has been sent


def main():
    """Run the pipeline: collect page URLs, scrape listings, export and e-mail."""
    s_time = time.time()  # start time, used to report total elapsed time
    page_queue = queue.Queue()  # listing-page URLs
    url_queue = queue.Queue()  # detail-page URLs found on the listing pages
    xxi_queue = queue.Queue()  # passed to saved_message, which does not use it
    threads = []
    threads_a = []
    items = []
    USER_AGENT = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ]

    url = 'https://cd.lianjia.com/ershoufang/tianfuxinqu/pg'
    start_page = int(input('\n请输入起始页码:'))
    end_page = int(input('\n请输入结束页码:'))
    for i in range(start_page, end_page + 1):  # inclusive of end_page
        page_queue.put(url + str(i) + "/")

    print('-' * 60 + '正在解析页面所有详细链接' + '-' * 60)
    # Stage 1: listing pages -> detail-page links.
    for _ in range(50):
        threads.append(analysis_url(page_queue, url_queue, USER_AGENT))
    for t in threads:
        time.sleep(0.05)  # stagger thread start-up
        t.start()
    for t in threads:
        t.join()
    print('-' * 60 + '解析页面所有详细链接结束' + '-' * 60)

    # Stage 2: detail pages -> rows in `items`.
    for _ in range(100):
        threads_a.append(saved_message(page_queue, url_queue, xxi_queue, items, USER_AGENT))
    for t in threads_a:
        time.sleep(0.05)  # stagger thread start-up
        t.start()
    for t in threads_a:
        t.join()

    # Stage 3: export the rows and e-mail the file.
    down_message(items, s_time).start()


if __name__ == '__main__':
    main()
若需要源文件,直接回复 二手房源文件,看到会为亲们提供