16-爬取租房信息

最新推荐文章于 2022-03-18 18:34:23 发布

chuiai8582

最新推荐文章于 2022-03-18 18:34:23 发布

阅读量315

点赞数

文章标签：面试 爬虫 python

原文链接：https://my.oschina.net/pansy0425/blog/3000220

版权

目的：爬取全国各地的租房信息

结果呈现：屏幕显示爬取完成的提示，并生成相应的文件夹

注：昨天去面试的时候，考官问我有没有爬过关于租房信息的一些网站，我说没有，但是应该很简单，于是今天就写一个爬取租房信息的小代码吧~~~

注：昨天的面试竟然让我去河南郑州……对不起，我真不想离开南京o(╥﹏╥)o！

58同城租房官网：https://www.58.com/zufang/changecity/

#下面为本实例的爬虫代码，若有问题可以给我留言，或者有更好的解决方法也可以私信我~

import re
import os
import requests
from bs4 import BeautifulSoup

def get_page(url, timeout=10):
    """Download *url* and return the response body as text.

    Args:
        url: Absolute URL of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The decoded page text, or ``None`` when the request fails. The
        error is printed instead of raised, so callers must handle ``None``.
    """
    headers = {'user-agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a single stalled connection blocks the crawl forever.
        r = requests.get(url, headers=headers, timeout=timeout)
        r.raise_for_status()
        # Use the charset detected from the body rather than the header value.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException as e:
        print(e)
        return None

def get_cate(url):
    """Collect the city listing URLs from the city-selection page.

    The page is expected to contain a ``<dl id="clist">`` whose ``<dt>``
    elements hold a group name and whose ``<dd>`` elements hold the city
    links belonging to that group.

    Args:
        url: URL of the city-selection page.

    Returns:
        A ``(cate_url, cate_name)`` tuple of two parallel lists. Each name
        has the form ``"<dt text>_<link text>"`` and each URL is the link's
        ``href`` prefixed with ``"https:"``. Both lists are empty when the
        page cannot be fetched or has no ``dl#clist`` element.
    """
    cate_name = []
    cate_url = []
    html = get_page(url)
    if html is None:
        # get_page already reported the failure; nothing to parse.
        return cate_url, cate_name
    soup = BeautifulSoup(html, 'html.parser')
    dl = soup.find('dl', {'id': 'clist'})
    if dl is None:
        return cate_url, cate_name
    groups = list(zip(dl('dt'), dl('dd')))
    # The first <dt>/<dd> pair is skipped, exactly as the index loop starting
    # at 1 did. NOTE(review): presumably a non-regional group - confirm.
    for dt, dd in groups[1:]:
        group_name = dt.text.strip()
        # href=True ignores anchors that carry no link target.
        for link in dd.find_all('a', href=True):
            cate_url.append('https:' + link['href'])
            cate_name.append(group_name + '_' + link.text.strip())
    return cate_url, cate_name

def get_num(url):
    """Return the number of result pages for a city listing.

    The count is read from the second-to-last link inside the
    ``<div class="pager">`` element.

    Args:
        url: URL of the first listing page of a city.

    Returns:
        The page count as an ``int``. Returns 1 when the page has no usable
        pager (the listing fits on one page) and 0 when the page itself
        could not be fetched.
    """
    html = get_page(url)
    if html is None:
        # The fetch failed (already reported by get_page): no pages to crawl.
        return 0
    soup = BeautifulSoup(html, 'html.parser')
    pager = soup.find('div', {'class': 'pager'})
    if pager is None:
        # No pager means every result is on this single page.
        return 1
    links = pager('a')
    if len(links) < 2:
        return 1
    try:
        return int(links[-2].text.strip())
    except ValueError:
        # The second-to-last link was not a page number.
        return 1

def get_house_url(url, cate):
    """Return the detail-page URLs of every listing on one result page.

    Args:
        url: URL of a listing result page.
        cate: Category name of the city; unused here, kept so existing
            callers keep working.

    Returns:
        A list of absolute ``https:`` URLs, one per listing found inside
        ``<ul class="listUl">``. Empty when the page cannot be fetched or
        contains no such list.
    """
    house = []
    html = get_page(url)
    if html is None:
        return house
    soup = BeautifulSoup(html, 'html.parser')
    ul = soup.find('ul', {'class': 'listUl'})
    if ul is None:
        return house
    for item in ul.find_all('div', {'class': 'des'}):
        heading = item.find('h2')
        link = heading.find('a', href=True) if heading is not None else None
        if link is None:
            # Entry without a title link: nothing to follow.
            continue
        href = link['href']
        if href.endswith('desc'):
            # Links ending in "desc" are rebuilt from their entinfo id into
            # the qd.58.com form; per the original author this does not
            # affect access.
            # NOTE(review): the host is hard-coded to "qd" - confirm.
            match = re.search(r'entinfo=(\d+)_', href)
            if match is None:
                continue
            href = '//qd.58.com/zufang/' + match.group(1) + 'x.shtml'
        house.append('https:' + href)
    return house

def _label_value_lines(ul):
    """Return one "label+value" string per <li> of *ul* holding two <span>s."""
    lines = []
    if ul is None:
        return lines
    for li in ul('li'):
        spans = li('span')
        if len(spans) < 2:
            # Malformed entry: skip it instead of losing the whole record.
            continue
        label = spans[0].text.strip()
        value = (spans[1].text.strip()
                 .replace('\xa0', '').replace(' ', '').replace('\n', ' '))
        lines.append(label + value)
    return lines


def get_info(url, cate):
    """Scrape one listing detail page and append it to a text file.

    The record is appended to ``./<first part of cate>/<second part>.txt``
    (``cate`` is split on ``"_"``); the directory is created when missing.
    The whole record is written in a single step, so a page that fails to
    parse leaves no partial record behind.

    Args:
        url: URL of the listing detail page.
        cate: Category name in the form ``"<group>_<city>"``.

    Raises:
        AttributeError, TypeError, IndexError: when a required element
            (title, description meta tag, price digits, payment span) is
            missing from the page.
    """
    parts = cate.split('_')
    path = './' + parts[0] + '/'
    os.makedirs(path, exist_ok=True)
    file = path + parts[1] + '.txt'

    html = get_page(url)
    if html is None:
        # The fetch failed and was already reported by get_page.
        return
    soup = BeautifulSoup(html, 'html.parser')

    # Required fields: a record without them is useless, so let errors propagate.
    title = soup.find('div', {'class': 'house-title'})('h1')[0].text.strip()
    info1 = soup.find('meta', {'name': 'description'})['content']
    # Price: first 4-digit run in the description, else the first 3-digit run.
    jg = (re.findall(r'\d{4}', info1) or re.findall(r'\d{3}', info1))[0]
    jg = jg + '元/每月'
    pay = soup.find('span', {'class': 'c_333'}).text.strip()

    lines = [
        '标题：' + title,
        '价格：' + jg,
        '付款：' + pay,
    ]
    lines.extend(_label_value_lines(soup.find('ul', {'class': 'f14'})))

    # Optional section: fall back to a placeholder when it is absent.
    disposal = soup.find('ul', {'class': 'house-disposal'})
    if disposal is None:
        house = '无'
    else:
        house = disposal.text.strip().replace('\n', ' ')
    lines.append('房屋配置：' + house)

    lines.extend(_label_value_lines(soup.find('ul', {'class': 'district-info-list'})))

    with open(file, 'a', encoding='utf-8') as f:
        # Every line ends with a newline; the extra one separates records.
        f.write('\n'.join(lines) + '\n\n')
    print('{}--{}--{}--?！'.format(parts[0], parts[1], title))


if __name__ == '__main__':
    # Entry point: walk every city, every result page, every listing.
    start_url = 'https://www.58.com/zufang/changecity/'  # page listing all rental cities
    cate_url, cate_name = get_cate(start_url)
    for city_url, city_name in zip(cate_url, cate_name):
        try:
            num = get_num(city_url)
        except Exception as e:
            print(e)
            continue
        for j in range(1, num + 1):
            page_url = city_url + 'pn' + str(j) + '/'
            try:
                house = get_house_url(page_url, city_name)
            except Exception as e:
                print(e)
                continue
            for item in house:
                # Isolate failures per listing so one bad page does not
                # abort the remaining listings of the city.
                try:
                    get_info(item, city_name)
                except Exception as e:
                    print(e)

（1）屏幕显示【部分】