import requests
from bs4 import BeautifulSoup
# Search-result page URLs for page numbers 1 through 14.
urls = [
    'http://bj.xiaozhu.com/search-duanzufang-p{}-0/'.format(page)
    for page in range(1, 15)
]
def spider_prepare(urls, timeout=10):
    """Collect detail-page links from a series of search-result pages.

    Every URL in ``urls`` is downloaded and parsed, and the ``href`` of each
    anchor matching ``a[class="resule_img_a"]`` is gathered.

    Args:
        urls: Iterable of search-result page URLs to download.
        timeout: Seconds to wait for each HTTP response; without a timeout
            a stalled server would block the crawl indefinitely.

    Returns:
        list: The collected ``href`` values in page order. Empty when
        ``urls`` is empty or no page contains a matching anchor.
    """
    spider_urls = []
    for url in urls:
        # Progress report: how many links earlier pages have produced so far.
        if spider_urls:
            print(len(spider_urls))
        response = requests.get(url, timeout=timeout)
        soup = BeautifulSoup(response.text, 'lxml')
        # Selector kept exactly as written (spelling included); presumably it
        # mirrors the class name used in the site's markup.
        for link in soup.select('a[class="resule_img_a"]'):
            new_link = link.get('href')
            # An anchor without an href yields None, which cannot be fetched
            # later, so it is skipped.
            if new_link:
                spider_urls.append(new_link)
    return spider_urls
def gender(name_gender):
    """Map the host's gender-icon CSS classes to a label.

    Args:
        name_gender: Value of the icon element's ``class`` attribute: a
            sequence whose first item is the class name, or ``None``/empty
            when the attribute is missing.

    Returns:
        str: ``'boy'``, ``'girl'``, or ``'未知'`` when the class is missing
        or not recognised.
    """
    if not name_gender:
        return '未知'
    if name_gender[0] == "member_boy_ico":
        return 'boy'
    if name_gender[0] == "member_girl_ico":
        return 'girl'
    return '未知'


# Collect the detail-page links once and reuse the result below; calling
# spider_prepare() a second time would download every search page again.
detail_urls = spider_prepare(urls)
# Check that the URLs to crawl were collected successfully.
print(detail_urls)

# Holds the most recently scraped record; stays empty if nothing is crawled.
data = {}
# Open the output file once, in append mode, for the whole crawl.
with open(r'D:\python3\tripadvisor_spider\xiaozhu.txt', 'a', encoding='utf-8') as file_text:
    for url in detail_urls:
        print(url)
        # A timeout keeps one stalled page from hanging the whole crawl.
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'lxml')
        title = soup.select('div.pho_info > h4')[0].text
        address = soup.select('div.con_l > div.pho_info > p')[0].get('title')
        price = soup.select('#pricePart > div.day_l > span')[0].text
        pic = soup.select('#curBigImage')[0].get('src')
        name = soup.select('a.lorder_name')[0].text
        name_pic = soup.select('div.member_pic > a > img')[0].get('src')
        name_gender = soup.select('div.w_240 > h6 > span')[0].get('class')
        data = {
            'title': title,
            'address': address,
            'price': price,
            'pic': pic,
            'name': name,
            'name_pic': name_pic,
            'name_gender': gender(name_gender),
        }
        # Verify the scraped record.
        print(data)
        # strip('\n') removes leading/trailing newlines from the title so the
        # record stays on one tab-separated line.
        file_text.write(
            '标题:{}\t地址:{}\t价格:{}\t照片:{}\t屋主姓名:{}\t屋主照片:{}\t屋主性别:{}\n'.format(
                data['title'].strip('\n'),
                data['address'],
                data['price'],
                data['pic'],
                data['name'],
                data['name_pic'],
                data['name_gender'],
            )
        )