【Web Scraping】Parsing Web Pages with BeautifulSoup


Parsing web pages with BeautifulSoup: first pulling the attribute table from a Tmall product detail page, then crawling JD search results sorted by sales.
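Before the two full scripts, a minimal sketch of the core calls used throughout: parse the document text, then pull out nodes with a CSS selector. The toy HTML string below is made up for illustration, modeled on the Tmall markup shown later:

from bs4 import BeautifulSoup

html = '<ul id="J_AttrUL"><li title=" 薄">厚薄: 薄</li></ul>'  # toy HTML for illustration
soup = BeautifulSoup(html, 'html.parser')
for li in soup.select('#J_AttrUL li'):  # CSS selector: every <li> under the element with id J_AttrUL
    print(li.text)  # -> 厚薄: 薄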

# Fetch Tmall product attributes
import requests
from bs4 import BeautifulSoup
import re
from collections import OrderedDict   # insertion-ordered dict

info = OrderedDict()  # holds all the information collected for this product
url = "https://detail.tmall.com/item.htm?spm=a230r.1.14.244.76bf523JAMdC7&id=544054883720&ns=1&abbucket=10"
res = requests.get(url)
soup = BeautifulSoup(res.text, "html.parser")
attrs = soup.select('#J_AttrUL li')  # Tmall puts the attribute list under the element with id J_AttrUL
print('attrs=', attrs)  # attrs= [<li title=" 薄">厚薄: 薄</li>, <li title=" 其他100%">材质成分: 其他100%</li>,<li ...</li>]

attrs_name = []
attrs_value = []
for attr in attrs:
    m = re.search(r'(.*?):[\s]*(.*)', attr.text)  # split "name: value"
    attrs_name.append(m.group(1))
    attrs_value.append(m.group(2))

print('attrs_name=', attrs_name)    # attrs_name= ['厚薄', '材质成分', ...]
print('attrs_value=', attrs_value)  # attrs_value= ['薄', '其他100%', ...]

allattrs = OrderedDict()  # every attribute found on the product detail page
for name, value in zip(attrs_name, attrs_value):
    allattrs[name] = value

print('allattrs=', allattrs)  # allattrs= OrderedDict([('厚薄', '薄'), ('材质成分', '其他100%'),...])
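Since OrderedDict accepts an iterable of key-value pairs, the loop above can also be written in one call:

allattrs = OrderedDict(zip(attrs_name, attrs_value))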

if '材质成分' in attrs_name:        # '材质成分' = material composition
    info['材质成分'] = allattrs['材质成分']
elif '面料' in attrs_name:          # '面料' = fabric, used as a fallback key
    info['材质成分'] = allattrs['面料']
else:
    info['材质成分'] = 'NA'
print(info)

'''
[\s] matches a single whitespace character; the trailing * in [\s]* means
"zero or more", so the match still succeeds when no space follows the colon.
* : matches the preceding expression any number of times (including zero)
'''
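A quick check of the pattern against one of the attribute strings shown above:

import re
m = re.search(r'(.*?):[\s]*(.*)', '厚薄: 薄')
print(m.group(1))  # -> '厚薄'  (attribute name)
print(m.group(2))  # -> '薄'    (attribute value)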

# Crawl JD search results sorted by sales volume
from bs4 import BeautifulSoup
import urllib.parse  # `import urllib` alone does not reliably expose urllib.parse in Python 3
import xlsxwriter    # for saving results to Excel; see the sketch at the end
import requests

headers = { "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36" }

def get_good_url(word):
    url_str = urllib.parse.quote(word)  # URL-encode the search keyword
    j = 1
    for i in range(1, 200, 2):  # first 100 result pages; JD's page parameter steps by 2 (1, 3, 5, ...)
        yield {
            'url': 'https://search.jd.com/Search?keyword={}&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&psort=3&stock=1&page={}&s={}&click=0'.format(url_str, i, j)
        }
        j += 60  # s: offset of the first result on the page

def get_html(url):
    html = requests.get(url, headers=headers)
    html.encoding = html.apparent_encoding  # guess the encoding from the body to avoid mojibake
    soup = BeautifulSoup(html.text, 'lxml')  # the 'lxml' parser requires the lxml package to be installed
    # print(soup)  # full page source
    return soup


def get_info(soup, good):
    titles = soup.find_all(class_="p-name p-name-type-2")
    # print(titles)
    prices = soup.find_all(class_="p-price")
    # print(prices)
    # [<div class="p-price">
    #  <strong class="J_10198901480" data-price="75.00"><em>¥</em><i>75.00</i></strong> </div>,

    commits = soup.find_all(class_="p-commit")
    # print(commits)
    # <div class="p-commit">
    #  <strong>已有<a href="//item.jd.com/10785388229.html#comment" id="J_comment_10785388229" onclick="searchlog(1,10785388229,29,3,'','flagsClk=1614807176')" target="_blank">2400+</a>条评价</strong>
    #  </div>]
    imgs = soup.find_all(class_="p-img")
    # print(imgs)
    # <div class="p-img"><a href="..." target="_blank">

    items = []  # collect one dict per product
    for title, price, commit, img in zip(titles, prices, commits, imgs):
        data = {
            'title': title.text.strip(),
            # 'price': price.text.strip(),          # '¥75.00'
            'price': price.text.strip()[1:-3],      # strip '¥' and '.00' -> '75'
            # 'commit': commit.text.strip(),        # '已有4.7万+条评价'
            'commit': get_commit(commit),           # numeric count, e.g. 47000
            'link': img.find_all('a')[0].get("href"),  # product page link
            'img': img.find_all('img')[0].get("src"),
        }
        items.append(data)
        # print(data)
    return items
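# get_commit() is called above but was missing from the original listing; a minimal
# sketch reconstructed from the sample strings in the comments ('已有4.7万+条评价' -> 47000,
# '2400+' -> 2400). The parsing rules here are assumptions, not JD's documented format.
def get_commit(commit):
    text = commit.text.strip()  # e.g. '已有4.7万+条评价'
    num = text.replace('已有', '').replace('条评价', '').replace('+', '')
    if num.endswith('万'):
        return int(float(num[:-1]) * 10000)  # '4.7万' -> 47000
    return int(num)  # '2400' -> 2400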

if __name__ == '__main__':
    good = '绿茶'  # search keyword ('green tea')
    links = get_good_url(good)
    # print(links)  # generator of url dicts
    results = []
    for link in links:
        # print(type(link))  # <class 'dict'>
        url = link['url']
        # print(url)
        html = get_html(url)
        results.extend(get_info(html, good))
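The listing imports xlsxwriter but never actually writes the scraped data anywhere. A minimal sketch of saving `results` to a spreadsheet; the helper name, file name, and column order are my own choices, not from the source:

import xlsxwriter

def save_to_xlsx(results, filename='jd_goods.xlsx'):  # hypothetical helper
    workbook = xlsxwriter.Workbook(filename)
    sheet = workbook.add_worksheet()
    columns = ['title', 'price', 'commit', 'link', 'img']  # keys produced by get_info()
    sheet.write_row(0, 0, columns)  # header row
    for row, item in enumerate(results, start=1):
        sheet.write_row(row, 0, [item[k] for k in columns])
    workbook.close()

# called after the crawl loop in __main__:
# save_to_xlsx(results)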
