Scraping 58同城 listings with a Python crawler (using rotating proxy IPs)
I'm a beginner; I spent a few days building this for a data-analysis project and finally got it working. Feedback is very welcome.
# coding=utf-8
import time

import requests
import xlwt
from bs4 import BeautifulSoup

# Request headers: fill in the User-Agent string of a real browser
# (e.g. the one that ships with Windows) so requests look less like a bot.
User_Agent = 'your-agent'
headers = {
    'User-Agent': User_Agent,
}
def download(url):
    # One workbook per run; row 0 holds the column headers
    # (title, layout, size, address 1, address 2, extras, price, district, facilities).
    excel = xlwt.Workbook(encoding='utf-8')
    sheet = excel.add_sheet('sheet1')
    sheet.write(0, 0, '标题')
    sheet.write(0, 1, '房子')
    sheet.write(0, 2, '大小')
    sheet.write(0, 3, '地址1')
    sheet.write(0, 4, '地址2')
    sheet.write(0, 5, '其他')
    sheet.write(0, 6, '价格')
    sheet.write(0, 7, '地区')
    sheet.write(0, 8, '其他2')
    db_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(db_data.text, 'lxml')
    # CSS selectors for the fields on the listing page;
    # these break if 58.com changes its markup.
    titles = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > h2 > a:nth-of-type(1)')
    houses = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.room')
    oneaddresss = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor > a:nth-of-type(1)')
    twoaddresss = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor > a:nth-of-type(2)')
    additions = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.des > p.infor')
    prices = soup.select(
        'body > div.list-wrap > div.list-box > ul > li > div.list-li-right > div.money > b')
    count = 1  # next row to write; row 0 holds the header
    delay = 5  # seconds to sleep between listings
    for title, house, oneaddress, twoaddress, addition, price in zip(
            titles, houses, oneaddresss, twoaddresss, additions, prices):
        # The magic indices ([20] and [64]) count whitespace-separated chunks
        # in 58.com's markup; they are fragile and may need adjusting if the
        # page layout changes.
        row = [
            str(title.string).replace(' ', '').replace('\n', ''),
            house.get_text().split(' ')[0].replace(' ', '').replace('\n', ''),
            house.get_text().split(' ')[20].replace(' ', '').replace('\n', '').replace('\xa0', ''),
            oneaddress.get_text().replace(' ', '').replace('\n', ''),
            twoaddress.get_text().replace(' ', '').replace('\n', ''),
            addition.get_text().split(' ')[64].replace(' ', '').replace('\n', ''),
            price.get_text().replace(' ', '').replace('\n', ''),
        ]
        for col, value in enumerate(row):
            sheet.write(count, col, value)
        # The detail-page URL is the href of the title link; reading it with
        # get('href') is sturdier than splitting str(title) on quote marks.
        url1 = title.get('href')
        time.sleep(5)
        # Retry the detail page with increasing back-off; if every attempt
        # fails, save what we have so far and move on to the next listing.
        db_data1 = None
        for wait in (0, 10, 35):
            time.sleep(wait)
            try:
                db_data1 = requests.get(url1, headers=headers)
                break
            except requests.RequestException:
                continue
        if db_data1 is None:
            excel.save('filename.xls')
            continue
        soup1 = BeautifulSoup(db_data1.text, 'lxml')
        pos = soup1.select('body > div.houseInfo > ul > li')
        # The district sits at another fragile magic index; if it is missing,
        # save the workbook and skip this listing instead of crashing.
        try:
            district = pos[2].get_text().split(' ')[28].replace(' ', '').replace('\n', '')
        except IndexError:
            excel.save('filename.xls')
            continue
        print(district)
        sheet.write(count, 7, district)
        # Facilities list: some detail pages nest it under div.fang-detail,
        # others put it directly under div.configure, so collect both.
        items = soup1.select('body > div.configure > div.fang-detail > dl > dd > ul > li')
        items += soup1.select('body > div.configure > ul > li')
        stra = ''
        for item in items:
            # Entries with more than ~25 characters of markup are layout
            # noise rather than facility names, so skip them.
            if len(str(item)) < 25:
                stra = stra + ',' + item.get_text().replace(' ', '').replace('\n', '')
        print(stra)
        sheet.write(count, 8, stra)
        count += 1
        print('scraped %d listings' % (count - 1))
        time.sleep(delay)  # throttle between listings to avoid getting blocked
    excel.save('filename.xls')
if __name__ == '__main__':
    download("https://cd.58.com/chuzu/pn6/?PGTID=0d3090a7-0006-6d54-6db2-a757b630de63&ClickID=2")
Because of 58同城's anti-scraping defenses, a single fixed IP gets blocked after fetching only a handful of listings, so I recommend using rotating proxy IPs.
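The script above still sends every request from one address. Below is a minimal sketch of how a rotating pool could be wired into requests; PROXY_POOL and get_with_proxy are hypothetical names, and the addresses are placeholders for whatever your proxy provider supplies. You would call get_with_proxy wherever the script calls requests.get.

import random
import requests

# Hypothetical pool: replace with real addresses from your proxy provider.
PROXY_POOL = [
    'http://user:pass@proxy1.example.com:8080',
    'http://user:pass@proxy2.example.com:8080',
]

def get_with_proxy(url, headers):
    # Pick a proxy at random so successive requests leave from different IPs.
    proxy = random.choice(PROXY_POOL)
    return requests.get(url, headers=headers,
                        proxies={'http': proxy, 'https': proxy},
                        timeout=10)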
Without further ado, here are the screenshots.
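For the data analysis mentioned at the top, one way (an assumption on my part, using pandas with the xlrd engine, which is what reads legacy .xls files) to load the saved workbook back is:

import pandas as pd

# Reading a legacy .xls workbook requires the xlrd package.
df = pd.read_excel('filename.xls')
print(df.head())
# Prices were stored as text; coerce them to numbers for quick stats.
print(pd.to_numeric(df['价格'], errors='coerce').describe())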