例如,在京东上看中了一款平板电脑,商品页面为 http://item.jd.com/1185294.html 。
要把该页面中的规格参数抽取出来,这里使用 Python 3。
首先导入一些模块
import json,urllib.request
from pandas import Series
from pandas import DataFrame
from bs4 import BeautifulSoup
抽取网页信息
# Download the raw product page. The context manager closes the HTTP
# connection as soon as the body has been read.
with urllib.request.urlopen('http://item.jd.com/1185294.html') as response:
    html = response.read()

# Name the parser explicitly so the parse result does not depend on which
# optional parsers happen to be installed, and so bs4 does not warn about
# an unspecified parser.
soup = BeautifulSoup(html, 'html.parser')

# The specification table is looked up inside the element with this id.
divSoup = soup.find(id="product-detail-2")

# find() returns None when the element is absent; treat that as "no rows"
# so the script prints an empty table instead of raising AttributeError.
trs = divSoup.find_all('tr') if divSoup is not None else []

# Collect (feature, property) pairs first and build the frame once:
# DataFrame.append was removed in pandas 2.0 and copied the whole frame on
# every call.
rows = []
for tr in trs:
    tds = tr.find_all('td')
    # Only rows with exactly two cells are kept as feature/value pairs;
    # rows with any other cell count are skipped.
    if len(tds) == 2:
        rows.append((tds[0].get_text(), tds[1].get_text()))

data = DataFrame(rows, columns=['Feature', 'Property'])
print(data)