Scraping 58同城 listings with a Python crawler (using rotating proxy IPs)
I'm a beginner; I spent a few days building this for a data-analysis project and finally got it working. Feedback is very welcome.
# coding=utf-8
import time

import requests
import xlwt
from bs4 import BeautifulSoup

# Request headers: fill in the User-Agent string of a real browser
# (e.g. the one that ships with Windows) so requests look less like a bot.
User_Agent = 'your-agent'
headers = {
    'User-Agent': User_Agent,
}
def download(url):
    # One workbook per run; row 0 holds the column headers
    # (title, layout, size, address 1, address 2, extras, price, district, facilities).
    excel = xlwt.Workbook(encoding='utf-8')
    sheet = excel.add_sheet('sheet1')
    sheet.write(0, 0, '标题')
    sheet.write(0, 1, '房子')
    sheet.write(0, 2, '大小')
    sheet.write(0, 3, '地址1')
    sheet.write(0, 4, '地址2')
    sheet.write(0, 5, '其他')
    sheet.write(0, 6, '价格')
    sheet.write(0, 7, '地区')
    sheet.write(0, 8, '其他2')
    db_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(db_data.text, 'lxml')
    # CSS selectors for the fields on the listing page;
    # these break if 58.com changes its markup.
    titles = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > h2 > a:nth-of-type(1)')
    houses = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.room')
    oneaddresss = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor > a:nth-of-type(1)')
    twoaddresss = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor > a:nth-of-type(2)')
    additions = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor')
    prices = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.list-li-right > div.money > b')
    count = 1  # next row to write; row 0 holds the header
    delay = 5  # seconds to sleep between listings
    for title, house, oneaddress, twoaddress, addition, price in zip(
            titles, houses, oneaddresss, twoaddresss, additions, prices):
        # The magic indices ([20] and [64]) count whitespace-separated chunks
        # in 58.com's markup; they are fragile and may need adjusting if the
        # page layout changes.
        row = [
            str(title.string).replace(' ', '').replace('\n', ''),
            house.get_text().split(' ')[0].replace(' ', '').replace('\n', ''),
            house.get_text().split(' ')[20].replace(' ', '').replace('\n', '').replace('\xa0', ''),
            oneaddress.get_text().replace(' ', '').replace('\n', ''),
            twoaddress.get_text().replace(' ', '').replace('\n', ''),
            addition.get_text().split(' ')[64].replace(' ', '').replace('\n', ''),
            price.get_text().replace(' ', '').replace('\n', ''),
        ]
        for col, value in enumerate(row):
            sheet.write(count, col, value)
        # The detail-page URL is the href of the title link; reading it with
        # get('href') is sturdier than splitting str(title) on quote marks.
        url1 = title.get('href')
        time.sleep(5)
        # Retry the detail page with increasing back-off; if every attempt
        # fails, save what we have so far and move on to the next listing.
        db_data1 = None
        for wait in (0, 10, 35):
            time.sleep(wait)
            try:
                db_data1 = requests.get(url1, headers=headers)
                break
            except requests.RequestException:
                continue
        if db_data1 is None:
            excel.save('filename.xls')
            continue
        soup1 = BeautifulSoup(db_data1.text, 'lxml')
        pos = soup1.select('body > div.houseInfo > ul > li')
        # The district sits at another fragile magic index; if it is missing,
        # save the workbook and skip this listing instead of crashing.
        try:
            district = pos[2].get_text().split(' ')[28].replace(' ', '').replace('\n', '')
        except IndexError:
            excel.save('filename.xls')
            continue
        print(district)
        sheet.write(count, 7, district)
        # Facilities list: some detail pages nest it under div.fang-detail,
        # others put it directly under div.configure, so collect both.
        items = soup1.select('body > div.configure > div.fang-detail > dl > dd > ul > li')
        items += soup1.select('body > div.configure > ul > li')
        stra = ''
        for item in items:
            # Entries with more than ~25 characters of markup are layout
            # noise rather than facility names, so skip them.
            if len(str(item)) < 25:
                stra = stra + ',' + item.get_text().replace(' ', '').replace('\n', '')
        print(stra)
        sheet.write(count, 8, stra)
        count += 1
        print('scraped %d listings' % (count - 1))
        time.sleep(delay)  # throttle between listings to avoid getting blocked
    excel.save('filename.xls')
if __name__ == '__main__':
    download("https://cd.58.com/chuzu/pn6/?PGTID=0d3090a7-0006-6d54-6db2-a757b630de63&ClickID=2")
Because of 58同城's anti-scraping defenses, a single fixed IP gets blocked after fetching only a handful of listings, so I recommend using rotating proxy IPs.
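The script above still sends every request from one address. Below is a minimal sketch of how a rotating pool could be wired into requests; PROXY_POOL and get_with_proxy are hypothetical names, and the addresses are placeholders for whatever your proxy provider supplies. You would call get_with_proxy wherever the script calls requests.get.

import random
import requests

# Hypothetical pool: replace with real addresses from your proxy provider.
PROXY_POOL = [
    'http://user:pass@proxy1.example.com:8080',
    'http://user:pass@proxy2.example.com:8080',
]

def get_with_proxy(url, headers):
    # Pick a proxy at random so successive requests leave from different IPs.
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)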
Without further ado, here are the screenshots.
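For the data analysis mentioned at the top, one way (an assumption on my part, using pandas with the xlrd engine, which is what reads legacy .xls files) to load the saved workbook back is:

import pandas as pd

# Reading a legacy .xls workbook requires the xlrd package.
df = pd.read_excel('filename.xls')
print(df.head())
# Prices were stored as text; coerce them to numbers for quick stats.
print(pd.to_numeric(df['价格'], errors='coerce').describe())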