爬取58同城上某商品分类前100页的商品信息，代码如下：
from bs4 import BeautifulSoup
import requests
import time
import lxml
def get_links_from(who_sells, page=1, timeout=10):
    """Collect item detail URLs from one listing page.

    Requests ``http://xa.58.com/pbdn/<who_sells>/pn<page>/`` and returns the
    ``href`` of every anchor matching ``td.t > a[onclick]``, with any query
    string (everything from the first ``?``) removed.

    Args:
        who_sells: Seller-type path segment; formatted into the URL as-is.
        page: Listing page number placed after ``pn`` in the URL.
        timeout: Seconds ``requests`` waits for the server before giving up.

    Returns:
        A list of URL strings; empty when no anchor matches.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    list_view = 'http://xa.58.com/pbdn/{}/pn{}/'.format(who_sells, page)
    # Without a timeout a stalled connection would block the crawl forever.
    web_data = requests.get(list_view, timeout=timeout)
    soup = BeautifulSoup(web_data.text, 'lxml')

    urls = []
    for link in soup.select('td.t > a[onclick]'):
        href = link.get('href')
        # The selector only guarantees an onclick attribute; an anchor
        # without href would otherwise fail on None.split().
        if href:
            urls.append(href.split('?')[0])
    return urls
def _first_text(soup, selector):
    """Return the text of the first element matching *selector*, or None."""
    element = soup.select_one(selector)
    return element.text if element is not None else None


def get_item_info(who_sells=0, page=1, timeout=10):
    """Fetch and print the details of every item on one listing page.

    Obtains the item URLs via ``get_links_from(who_sells, page)``, downloads
    each detail page, extracts a handful of fields and prints them as a dict.
    A field whose selector matches nothing is reported as ``None`` so that a
    single incomplete page does not abort the whole crawl.

    Args:
        who_sells: Seller type passed to ``get_links_from``; ``0`` is
            labelled '个人' in the output, any other value '商家'.
        page: Listing page number passed to ``get_links_from``.
        timeout: Seconds to wait for each detail-page request (the listing
            request made by ``get_links_from`` is not affected).

    Returns:
        None. Each item's data is printed to standard output.

    Raises:
        requests.RequestException: If a detail-page request fails or
            times out.
    """
    for url in get_links_from(who_sells, page):
        web_data = requests.get(url, timeout=timeout)
        # Pause between detail requests to avoid hammering the server.
        time.sleep(1)
        soup = BeautifulSoup(web_data.text, 'lxml')
        data = {
            'title': _first_text(soup, 'div.box_left_top > h1'),
            'price': _first_text(soup, 'div.price_li > span > i'),
            # NOTE(review): 'palce_li' is kept exactly as in the original
            # selector; presumably it mirrors the site's own class name --
            # confirm against the page markup before "correcting" it.
            'area': _first_text(soup, 'div.palce_li > span > i'),
            'look_time': _first_text(
                soup, 'div.box_left_top > p > span.look_time'),
            'want_person': _first_text(
                soup, 'div.box_left_top > p > span.want_person'),
            'cate': '个人' if who_sells == 0 else '商家',
        }
        print(data)
# Script entry: walk listing pages 1 through 100 for seller type 0 and
# print every item found on each page.
for page in range(1, 101):
    get_item_info(page=page, who_sells=0)
结果如图: