from bs4 import BeautifulSoup
import time,requests,re
# Browser-like request headers. The original code passed `headers=header`
# without ever defining `header`, which raised NameError the moment the
# module was imported — define it here.
header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
# Sample item detail page; get_view() falls back to this module-level URL.
url = 'http://wx.58.com/pingbandiannao/25892738648911x.shtml'
wb_data = requests.get(url, headers=header)
soup = BeautifulSoup(wb_data.text, 'lxml')
def get_links_from(who_sells, page=2):
    """Collect item detail-page URLs from a 58.com tablet listing page.

    Args:
        who_sells: 0 for personal sellers, 1 for merchants
            (interpolated into the listing path).
        page: listing page number to scrape. Defaults to 2, which
            preserves the original hard-coded ``pn2`` behavior.

    Returns:
        list[str]: detail-page URLs with query strings stripped and
        zhuanzhuan.com items filtered out.
    """
    list_view = 'http://wx.58.com/pbdn/{}/pn{}/'.format(who_sells, page)
    wb_data = requests.get(list_view)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    urls = []
    for link in soup.select('td.t > a.t'):
        url_one = link.get('href').split('?')[0]
        # Skip zhuanzhuan.com pages: their layout differs from regular
        # 58.com detail pages and cannot be parsed by this scraper.
        if 'zhuanzhuan' not in url_one:
            urls.append(url_one)
    return urls
def get_view(item_url=None):
    """Fetch the page-view count of an item via 58.com's counter API.

    Args:
        item_url: detail-page URL to query. Defaults to ``None``, in
            which case the module-level ``url`` is used — exactly what
            the original zero-argument version always did.

    Returns:
        str: the view count, i.e. the text after the last '=' in the
        counter API response.
    """
    target = url if item_url is None else item_url
    # The numeric info id immediately precedes 'x.shtml' in the URL.
    # The original pattern required a '...nao/' prefix, which only
    # matched the pingbandiannao sample URL; \d+ works for any item.
    info_id = re.findall(r'/(\d+)x\.shtml', target)
    api = 'http://jst1.58.com/counter?infoid={}'.format(info_id[0])
    js = requests.get(api)
    views = js.text.split('=')[-1]
    return views
def get_item_info(who_sells=0):
    """Scrape title/price/date/area/views for every item on a listing page.

    Args:
        who_sells: 0 = personal seller ('个人'), anything else = merchant
            ('商家'); forwarded to get_links_from().

    Side effects:
        Prints one dict per scraped item; returns nothing.
    """
    for item_url in get_links_from(who_sells):
        wb_data = requests.get(item_url)
        soup = BeautifulSoup(wb_data.text, 'lxml')
        # Query the counter API with THIS item's id. The original called
        # get_view(), which always read the fixed module-level `url`, so
        # every item reported the same view count.
        info_id = re.findall(r'/(\d+)x\.shtml', item_url)
        if info_id:
            api = 'http://jst1.58.com/counter?infoid={}'.format(info_id[0])
            views = requests.get(api).text.split('=')[-1]
        else:
            views = None  # URL shape didn't match; don't crash the loop
        data = {
            'title': soup.title.text,
            'price': soup.select('div.su_con > span.c_f50')[0].text,
            'date': soup.select('li.time')[0].text,
            # Some pages have no area span — guard to avoid IndexError.
            'area': list(soup.select('span.c_25d')[0].stripped_strings)
                    if soup.select('span.c_25d') else None,
            'url': item_url,
            'cate': '个人' if who_sells == 0 else '商家',
            'views': views,
        }
        print(data)
# Script entry point: scrape personal-seller listings (who_sells=0).
# Guarded so importing this module does not trigger a full crawl.
if __name__ == '__main__':
    get_item_info()