用 BeautifulSoup 解析网页
# Fetch Tmall (天猫) product parameters and extract the material field (材质成分).
import re
from collections import OrderedDict  # ordered dict: keep attribute order as on the page

# "name: value" — [\s]* allows any (or no) whitespace after the colon.
_ATTR_RE = re.compile(r'(.*?):[\s]*(.*)')


def parse_attr(text):
    """Split one attribute string like '厚薄: 薄' into (name, value).

    Returns None when the text does not match the "name: value" shape
    (the original code called .group() on a None match and crashed).
    The pattern is compiled once and searched once per item (the original
    ran the same re.search twice per attribute).
    """
    m = _ATTR_RE.search(text)
    return (m.group(1), m.group(2)) if m else None


def extract_material(allattrs):
    """Return the material composition from the attribute mapping.

    Prefers 材质成分, falls back to 面料, else 'NA' — same precedence
    as the original if/elif/else chain.
    """
    if '材质成分' in allattrs:
        return allattrs['材质成分']
    if '面料' in allattrs:
        return allattrs['面料']
    return 'NA'


def main():
    # Third-party imports kept function-local so the module imports cleanly
    # without network libraries installed, and no HTTP request fires on import.
    import requests
    from bs4 import BeautifulSoup

    info = OrderedDict()  # all info collected for this product
    url = ("https://detail.tmall.com/item.htm?"
           "spm=a230r.1.14.244.76bf523JAMdC7&id=544054883720&ns=1&abbucket=10")
    res = requests.get(url)
    soup = BeautifulSoup(res.text, "html.parser")

    # Tmall attribute list, e.g. <li title=" 薄">厚薄: 薄</li>
    attrs = soup.select('#J_AttrUL li')
    print('attrs=', attrs)

    allattrs = OrderedDict()  # every attribute shown on the detail page
    for attr in attrs:
        parsed = parse_attr(attr.text)
        if parsed is not None:  # skip items that are not "name: value"
            allattrs[parsed[0]] = parsed[1]
    print('allattrs=', allattrs)  # e.g. OrderedDict([('厚薄', '薄'), ('材质成分', '其他100%'), ...])

    info['材质成分'] = extract_material(allattrs)
    print(info)


if __name__ == '__main__':
    main()
# Search-result list sorted by sales (psort=3).
# NOTE(review): the original header said 淘宝女装 (Taobao), but every URL targets
# search.jd.com — this scrapes JD. Unused xlsxwriter/selenium imports removed.
import re
import urllib.parse  # bare 'import urllib' does NOT guarantee urllib.parse in Py3

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36"
}


def get_good_url(word):
    """Yield {'url': ...} for the first 100 JD search pages for *word*.

    JD uses odd page numbers (1, 3, ..., 199) and an item offset 's'
    that advances by 60 per page.
    """
    url_str = urllib.parse.quote(word)  # URL-encode the keyword
    j = 1  # 's' parameter: index of the first item on the page
    for i in range(1, 200, 2):  # first 100 pages
        yield {
            'url': ('https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&psort=3&stock=1&page={}&s={}&click=0'.format(url_str, i, j))
        }
        j += 60


def get_commit(commit):
    """Normalize a JD comment-count tag into an int.

    Accepts a bs4 tag (uses .text) or a plain string, e.g.
    '已有2400+条评价' -> 2400, '已有4.7万+条评价' -> 47000.
    This function was called by get_info in the original but never
    defined — a guaranteed NameError at runtime.
    """
    text = commit.text.strip() if hasattr(commit, 'text') else str(commit)
    m = re.search(r'([\d.]+)(万)?', text)
    if not m:
        return 0
    count = float(m.group(1))
    if m.group(2):  # 万 = 10,000
        count *= 10000
    return int(count)


def get_html(url):
    """Download one search page and return it as a parsed soup tree."""
    # Function-local imports keep the module importable without these libs.
    import requests
    from bs4 import BeautifulSoup

    html = requests.get(url, headers=headers)
    html.encoding = html.apparent_encoding  # let requests guess the real charset
    return BeautifulSoup(html.text, 'lxml')


def get_info(soup, good):
    """Extract title/price/comment-count/link/image for each product on a page.

    Returns the list of row dicts — the original built each dict and
    silently discarded it. *good* (the keyword) is kept for interface
    compatibility; it is not used here.
    """
    titles = soup.find_all(class_="p-name p-name-type-2")
    # e.g. <div class="p-price"><strong ...><em>¥</em><i>75.00</i></strong></div>
    prices = soup.find_all(class_="p-price")
    # e.g. <div class="p-commit"><strong>已有<a ...>2400+</a>条评价</strong></div>
    commits = soup.find_all(class_="p-commit")
    imgs = soup.find_all(class_="p-img")

    results = []
    for title, price, commit, img in zip(titles, prices, commits, imgs):
        data = {
            'title': title.text.strip(),
            'price': price.text.strip()[1:-3],  # strip leading '¥' and trailing '.00' -> '75'
            'commit': get_commit(commit),       # normalized count, e.g. 47000
            'link': img.find_all('a')[0].get("href"),   # product link
            'img': img.find_all('img')[0].get("src"),
        }
        results.append(data)
    return results


if __name__ == '__main__':
    good = '绿茶'
    links = get_good_url(good)  # generator of {'url': ...} dicts
    for link in links:
        url = link['url']
        html = get_html(url)
        get_info(html, good)
【爬虫】BeautifulSoup解析网页
最新推荐文章于 2024-08-30 10:01:19 发布