requests 获取div_获取成都天府新区二手房信息并邮件发送

最新推荐文章于 2020-12-11 02:57:01 发布

weixin_39683025

最新推荐文章于 2020-12-11 02:57:01 发布

阅读量2.4k

点赞数

文章标签： requests 获取div

本文链接：https://blog.csdn.net/weixin_39683025/article/details/111614018

版权

获取房源详细信息时,先分析结构(天府新区)

①：

起始页面：

https://cd.lianjia.com/ershoufang/tianfuxinqu/

https://cd.lianjia.com/ershoufang/tianfuxinqu/pg1

第二页：

https://cd.lianjia.com/ershoufang/tianfuxinqu/pg2/

所有分页的地址均为在起始地址尾部追加 pg + 页码（例如 pg1、pg2）

生成指定区间页面

  # Every listing page shares one base URL; the page number and a trailing
  # slash are appended to it.
  url = 'https://cd.lianjia.com/ershoufang/tianfuxinqu/pg'
  start_page = int(input('\n请输入起始页码：'))
  end_page = int(input('\n请输入结束页码：'))
  # Enqueue the inclusive range [start_page, end_page] for the worker threads.
  for page_no in range(start_page, end_page + 1):
      page_queue.put('{}{}/'.format(url, page_no))

②：

根据所得页面解析房源具体链接

用BeautifulSoup获取

#导入模块

from bs4 import BeautifulSoup as BS

    def parse_page(self,url):
        """Collect every listing-detail link found on one result page.

        Downloads ``url`` with a randomly chosen User-Agent, selects the
        ``.sellListContent > li`` entries and pushes the ``href`` of each
        entry's first anchor onto ``self.url_queue``.

        Args:
            url: Address of one paginated listing page.

        Network errors are printed and the page is skipped; nothing is raised
        to the caller.
        """
        try:
            # A timeout keeps a stalled connection from blocking the worker
            # thread forever.
            response = requests.get(
                url,
                headers={'User-Agent': random.choice(self.USER_AGENT)},
                timeout=10,
            )
        except requests.RequestException as err:
            print(err)
            return
        soup = BS(response.text, 'html.parser')
        for entry in soup.select('.sellListContent > li'):
            anchor = entry.a
            href = anchor.get('href') if anchor is not None else None
            if not href:
                # Skip entries without a link instead of aborting the rest of
                # the page.
                continue
            self.url_queue.put(href)
            print(href)

③：

根据房源具体链接获得数据：

  #以下为解析网页获取内容  new_url = self.url_queue.get()  xx_soup=BS(requests.get(new_url,headers={'User-Agent': random.choice(self.USER_AGENT)}).text,'html.parser')  lp_name=xx_soup.select('.communityName > a')[0].text                       #楼盘名称  lp_address = '-'.join(xx_soup.select('.areaName > span')[1].text.split())  # 楼盘位置  lp_jiedai = xx_soup.select('.brokerName > a')[0].text                      #接待  lp_jdph = xx_soup.select('.phone')[1].text[:-6]                            #接待电话  lp_gptim = xx_soup.select('#introduction > div > div > .transaction > div > ul > li > span')[1].text                  #挂牌时间  lp_dyqq = re.sub('[\n ]', '',xx_soup.select('#introduction > div > div > .transaction > div > ul > li')[6].text)[4:]  #有无抵押  lp_sprice=xx_soup.select('.price > span')[0].text + '万元'  #总价  lp_price=xx_soup.select('.unitPrice > span')[0].text        #单价  lp_tslc=xx_soup.select('.room > div')                    #楼层与厅室lp_tslc分别取[0]、[1]  lp_chaoxzx=xx_soup.select('.houseInfo > .type > div')    #朝向与装修lp_chaoxzx分别取[0]、[1]  lp_sjieg=xx_soup.select('.houseInfo > .area > div')      #面积与结构lp_chaoxzx分别取[0]、[1]  lp_dt=xx_soup.select('.introContent > div > .content > ul > li')[10].text   #获取电梯情况列表  lp_dt=re.sub('[交易权属商品房\n]+','配备电梯暂无数据',lp_dt)                  #替换列表中废弃物  lp_other=xx_soup.select('.content > a')                                     #楼盘其它信息  lp_other='-'.join(re.sub('[a-z<>":/.0-9=_,[\]]','',str(lp_other)).split())

④：

保存并邮件发送文件

  # The output is tab-separated text saved under an .xls name; a tab moves to
  # the next column when the file is opened in a spreadsheet.
  wj_name = 'esf.xls'
  header = 'A楼盘挂牌日\tB楼盘信息\tC均价/总价\tD楼盘位置\tE顾问\tF接待电话\tG楼层修筑日期结构\tH楼盘其它信息\tI抵押情况\tj原地址\n'
  # One context manager for header and rows, so the handle is closed even if
  # a write fails.
  with open(wj_name, 'a+', encoding='utf-8') as f:
      f.write(header)
      for msg in self.items:
          # Each cell is followed by a tab (including the last one), matching
          # the layout the file has always had.
          f.write(''.join(x_msg + '\t' for x_msg in msg) + '\n')

邮件发送时需自行在邮箱中申请授权码（此处以 QQ 邮箱为例）：

qq邮箱授权码申请：

qq邮箱-设置-账户下开启IMAP/SMTP服务即可

保存好授权码不要泄漏

  # Send the generated file as an e-mail attachment.
  mail_user = "13579111315@qq.com"
  mail_pass = "jcxxmwvfxruzh***"     # authorization code requested in the QQ mailbox settings
  receivers = ['735324273@qq.com']   # one or more recipient addresses
  msg = MIMEMultipart()
  msg["Subject"] = wj_name        # the subject is the attachment file name
  msg["From"] = 'snowing'         # sender display name
  msg["To"] = 'summer'            # recipient display name
  msg.attach(MIMEText("哔哔"))    # plain-text body
  # Read the attachment inside a context manager so the handle is closed.
  with open(wj_name, 'rb') as attachment:
      part = MIMEApplication(attachment.read())
  part.add_header('Content-Disposition', 'attachment', filename=wj_name)
  msg.attach(part)
  # The context manager closes the SMTP session even when login or sending
  # fails.
  # NOTE(review): this is a plain SMTP connection on the default port;
  # consider SMTP_SSL or STARTTLS if the provider supports it - confirm.
  with smtplib.SMTP("smtp.qq.com", timeout=22) as s:
      s.login(mail_user, mail_pass)
      s.sendmail(mail_user, receivers, msg.as_string())
  print('发送成功')
  # Remove the local file only after the message has been sent successfully.
  os.remove(wj_name)

完整代码如下：

"""Scrape second-hand housing listings and e-mail them as a spreadsheet.

Pipeline:
    1. ``analysis_url`` workers turn listing pages into detail-page links.
    2. ``saved_message`` workers turn each detail page into one output row.
    3. ``down_message`` writes the rows to a tab-separated file and mails it.
"""
import os
import queue
import random
import re
import smtplib
import threading
import time
from email.mime.application import MIMEApplication
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

import requests
from bs4 import BeautifulSoup as BS

# import pandas as pd  # only needed for an optional DataFrame-based export

# Seconds to wait for an HTTP response before giving up on a page.
REQUEST_TIMEOUT = 10

# Column titles of the output sheet (tab-separated, newline-terminated).
SHEET_HEADER = 'A楼盘挂牌日\tB楼盘信息\tC均价/总价\tD楼盘位置\tE顾问\tF接待电话\tG楼层修筑日期结构\tH楼盘其它信息\tI抵押情况\tj原地址\n'


def _fetch_soup(url, user_agents):
    """Download ``url`` with a random User-Agent and return the parsed HTML."""
    response = requests.get(
        url,
        headers={'User-Agent': random.choice(user_agents)},
        timeout=REQUEST_TIMEOUT,
    )
    return BS(response.text, 'html.parser')


def _run_staggered(workers, delay=0.05):
    """Start each thread after a short pause, then wait for all of them."""
    for worker in workers:
        time.sleep(delay)
        worker.start()
    for worker in workers:
        worker.join()


class analysis_url(threading.Thread):
    """Worker that extracts detail-page links from listing pages.

    Args:
        page_queue: Queue of listing-page URLs to consume.
        url_queue: Queue that receives the detail-page links found.
        USER_AGENT: Sequence of User-Agent strings to choose from.
    """

    def __init__(self, page_queue, url_queue, USER_AGENT, *args, **kwargs):
        super(analysis_url, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.url_queue = url_queue
        self.USER_AGENT = USER_AGENT

    def run(self):
        """Consume listing pages until the page queue is drained."""
        while True:
            # get_nowait() avoids the empty()/get() race: with many workers,
            # another thread may take the last item between the two calls and
            # leave this one blocked forever.
            try:
                url = self.page_queue.get_nowait()
            except queue.Empty:
                break
            self.parse_page(url)

    def parse_page(self, url):
        """Push every detail-page link found on ``url`` onto ``url_queue``.

        Network errors are printed and the page is skipped.
        """
        try:
            soup = _fetch_soup(url, self.USER_AGENT)
        except requests.RequestException as err:
            print(err)
            return
        for entry in soup.select('.sellListContent > li'):
            anchor = entry.a
            href = anchor.get('href') if anchor is not None else None
            if href:
                # Entries without a link are skipped rather than aborting the
                # remaining entries of the page.
                self.url_queue.put(href)


class saved_message(threading.Thread):
    """Worker that converts detail pages into output rows.

    Args:
        page_queue: Queue of listing-page URLs (stored, not read here).
        url_queue: Queue of detail-page URLs to consume.
        xxi_queue: Queue reserved for detail records (stored, not read here).
        items: Shared list that receives one row (list of str) per listing.
        USER_AGENT: Sequence of User-Agent strings to choose from.
    """

    def __init__(self, page_queue, url_queue, xxi_queue, items, USER_AGENT,
                 *args, **kwargs):
        super(saved_message, self).__init__(*args, **kwargs)
        self.page_queue = page_queue
        self.url_queue = url_queue
        self.xxi_queue = xxi_queue
        self.items = items
        self.USER_AGENT = USER_AGENT

    def run(self):
        """Consume detail pages until the URL queue is drained."""
        while True:
            try:
                new_url = self.url_queue.get_nowait()
            except queue.Empty:
                break
            try:
                item = self._parse_detail(new_url)
            except Exception as err:
                # Best effort: a page that fails to download or lacks an
                # expected element is reported and skipped.
                print(new_url, err)
                continue
            print('完成解析网页并保存数据', new_url)
            self.items.append(item)

    def _parse_detail(self, new_url):
        """Return the ten-column output row for the listing at ``new_url``.

        Raises:
            IndexError: If an expected element is missing from the page.
            requests.RequestException: If the page cannot be downloaded.
        """
        xx_soup = _fetch_soup(new_url, self.USER_AGENT)
        transaction = '#introduction > div > div > .transaction > div > ul > li'

        lp_name = xx_soup.select('.communityName > a')[0].text      # community name
        lp_address = '-'.join(
            xx_soup.select('.areaName > span')[1].text.split())     # location
        lp_jiedai = xx_soup.select('.brokerName > a')[0].text       # agent
        # Agent phone; the last six characters of the text are dropped.
        lp_jdph = xx_soup.select('.phone')[1].text[:-6]
        lp_gptim = xx_soup.select(transaction + ' > span')[1].text  # listing date
        # Mortgage status: newlines/spaces removed, first four characters cut.
        lp_dyqq = re.sub('[\n ]', '', xx_soup.select(transaction)[6].text)[4:]
        lp_sprice = xx_soup.select('.price > span')[0].text + '万元'  # total price
        lp_price = xx_soup.select('.unitPrice > span')[0].text      # unit price
        lp_tslc = xx_soup.select('.room > div')                     # room layout / floor
        lp_chaoxzx = xx_soup.select('.houseInfo > .type > div')     # orientation / decoration
        lp_sjieg = xx_soup.select('.houseInfo > .area > div')       # area / structure
        # Elevator entry; any run of the listed characters (or newlines) is
        # replaced with a "no elevator data" placeholder.
        lp_dt = xx_soup.select('.introContent > div > .content > ul > li')[10].text
        lp_dt = re.sub('[交易权属商品房\n]+', '配备电梯暂无数据', lp_dt)
        # Other info: stringify the anchors, strip markup-like characters and
        # join what is left with '-'. Raw string avoids the invalid '\]'
        # escape while keeping the pattern identical.
        lp_other = '-'.join(
            re.sub(r'[a-z<>":/.0-9=_,[\]]', '',
                   str(xx_soup.select('.content > a'))).split())

        # Column order matches SHEET_HEADER. With pandas, the same values
        # could instead be collected into a dict keyed by the column titles
        # and exported through a DataFrame.
        return [
            lp_gptim,
            '-'.join([lp_name, lp_tslc[0].text, lp_sjieg[0].text,
                      lp_chaoxzx[0].text, lp_chaoxzx[1].text, lp_dt]),
            lp_price + '-' + lp_sprice,
            lp_address,
            lp_jiedai,
            lp_jdph,
            lp_tslc[1].text + '/' + lp_sjieg[1].text,
            lp_other,
            lp_dyqq,
            new_url,
        ]


class down_message(threading.Thread):
    """Thread that writes the collected rows to a file and e-mails it.

    Args:
        items: List of rows, each a list of str.
        s_time: Start timestamp (``time.time()``) used to report elapsed time.
    """

    def __init__(self, items, s_time, *args, **kwargs):
        super(down_message, self).__init__(*args, **kwargs)
        self.items = items
        self.s_time = s_time

    def run(self):
        """Write the sheet, mail it, then delete the local copy."""
        wj_name = 'esf.xls'
        self._write_sheet(wj_name)
        print('保存文件完毕 耗时', time.time() - self.s_time)
        print('开始发送邮件')
        self._send_mail(wj_name)
        print('发送成功')
        # Removed only after a successful send.
        os.remove(wj_name)

    def _write_sheet(self, wj_name):
        """Append the header and all rows to ``wj_name`` as tab-separated text."""
        with open(wj_name, 'a+', encoding='utf-8') as f:
            f.write(SHEET_HEADER)
            for msg in self.items:
                # Every cell, including the last, is followed by a tab.
                f.write(''.join(x_msg + '\t' for x_msg in msg) + '\n')

    def _send_mail(self, wj_name):
        """Mail ``wj_name`` as an attachment through the QQ SMTP server."""
        mail_user = "13579111315@qq.com"
        mail_pass = "jcxxmwvfxr******"     # authorization code from the QQ mailbox settings
        receivers = ['735324273@qq.com']   # one or more recipient addresses
        msg = MIMEMultipart()
        msg["Subject"] = wj_name        # subject is the attachment file name
        msg["From"] = 'snowing'         # sender display name
        msg["To"] = 'summer'            # recipient display name
        msg.attach(MIMEText("哔哔"))    # plain-text body
        with open(wj_name, 'rb') as attachment:
            part = MIMEApplication(attachment.read())
        part.add_header('Content-Disposition', 'attachment', filename=wj_name)
        msg.attach(part)
        # The context manager closes the session even if login/sending fails.
        # NOTE(review): plain SMTP on the default port; consider SMTP_SSL or
        # STARTTLS if the provider supports it - confirm.
        with smtplib.SMTP("smtp.qq.com", timeout=22) as s:
            s.login(mail_user, mail_pass)
            s.sendmail(mail_user, receivers, msg.as_string())


def main():
    """Ask for a page range, scrape it with worker threads and mail the result."""
    s_time = time.time()             # start time, used for the elapsed report
    page_queue = queue.Queue()       # listing-page URLs
    url_queue = queue.Queue()        # detail-page URLs found on the listing pages
    xxi_queue = queue.Queue()        # reserved for detail records
    items = []
    USER_AGENT = [
        "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0)",
        "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0)",
        "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Maxthon 2.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; TencentTraveler 4.0)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; The World)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Avant Browser)",
        "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)",
    ]
    url = 'https://cd.lianjia.com/ershoufang/tianfuxinqu/pg'
    start_page = int(input('\n请输入起始页码：'))
    end_page = int(input('\n请输入结束页码：'))
    for i in range(start_page, end_page + 1):
        page_queue.put(url + str(i) + "/")

    print('-' * 60 + '正在解析页面所有详细链接' + '-' * 60)
    # Links arrive in no particular order because the workers run concurrently.
    threads = [analysis_url(page_queue, url_queue, USER_AGENT)
               for _ in range(50)]
    _run_staggered(threads)
    print('-' * 60 + '解析页面所有详细链接结束' + '-' * 60)

    threads_a = [saved_message(page_queue, url_queue, xxi_queue, items, USER_AGENT)
                 for _ in range(100)]
    _run_staggered(threads_a)

    # A single thread writes the sheet and sends the mail.
    down_message(items, s_time).start()


if __name__ == '__main__':
    main()

若需要源文件，直接回复二手房源文件，看到会为亲们提供

weixin_39683025

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
复制链接

分享到 QQ

分享到新浪微博

扫一扫