爬取淘宝商品信息

import json
import re

import requests
def getHTMLText(url):
    """Fetch ``url`` with browser-like headers and return the response body.

    Args:
        url: Address of the page to download.

    Returns:
        The decoded response text, or an empty string when the request fails
        (connection error, timeout, or a non-2xx status code).
    """
    # NOTE(review): the cookie below is a hard-coded login session captured
    # from a browser; it will expire and should not live in source control.
    # Consider loading it from configuration or the environment instead.
    headers = {
        "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36",
        "cookie": "t=925dee34e81f413e4fef7a69c7f090c0; thw=cn; enc=USKVUh4tJthiI1eCoRjSvXMMwcQCUB6Pm%2FG2e%2Bx8Zzj2R8mvHbyIRMvjw5uHSAxRJqybVuPHcpheH4rViVghGA%3D%3D; ubn=p; ucn=center; hng=CN%7Czh-CN%7CCNY%7C156; _tb_token_=7eee31ea6eeee; _m_h5_tk=4d5d14490fdce5319f8541adfdadb480_1588050782216; _m_h5_tk_enc=d614cdbac7a97458b8e2c70f4cbcd729; cookie2=1557558a4332ef81858dc21b47817056; _samesite_flag_=true; mt=ci=0_0; cna=zqE4Ft8uf1YCAW4QakxC3scY; v=0; sgcookie=EelepVTQ%2F07Mt9LFQ%2BFwN; unb=3978845940; uc1=cookie16=URm48syIJ1yk0MX2J7mAAEhTuw%3D%3D&cookie15=URm48syIIVrSKA%3D%3D&cookie21=VT5L2FSpczFp&existShop=false&pas=0&cookie14=UoTUMtAXczzAIA%3D%3D; uc3=nk2=2%2FwbQB%2BJkrPffA%3D%3D&id2=UNkweIxWG1CnGg%3D%3D&vt3=F8dBxGRxWjf%2F%2BLsn9sU%3D&lg2=UIHiLt3xD8xYTw%3D%3D; csg=fc690fa1; lgc=%5Cu9AD8%5Cu6D69%5Cu8F691539; cookie17=UNkweIxWG1CnGg%3D%3D; dnk=%5Cu9AD8%5Cu6D69%5Cu8F691539; skt=344aaa20257e02a1; existShop=MTU4ODA0MjYyOA%3D%3D; uc4=id4=0%40Ug46tyo4ob3biEziEaxPUpGo6GU%2F&nk4=0%402ZJ%2Bt%2F9NMqURgG4XCISzJus7T6v1; tracknick=%5Cu9AD8%5Cu6D69%5Cu8F691539; _cc_=Vq8l%2BKCLiw%3D%3D; _l_g_=Ug%3D%3D; sg=90a; _nk_=%5Cu9AD8%5Cu6D69%5Cu8F691539; cookie1=UNk2H5M%2FwzJuY4rqcy9QEhX%2F39S6q%2B0QNDyKQ9qvKtI%3D; tfstk=c2dPBuwtzbhr1fGvB_CFPAV0q0ZRaaRH1YQdEdPsFzE3z0BlbsvYvZ0bv3KFXjXl.; isg=BLGxa6YHS-ekROcx-ophTXt2wD1LniUQ4sSRBZPGs3i3utAM2-yN4DHb3E7cL71I; l=eBIN-eicQJu_wU0GBOfwourza77O_IRjWuPzaNbMiT5POMfw9MiGWZjoJfYeCnGVHsOyR3yAaP04B7YO6ydSnxv9-M80ACMmndC.."}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        # Turn 4xx/5xx responses into an exception so they share the
        # failure path below.
        r.raise_for_status()
        # Decode using the encoding detected from the body rather than the
        # one declared in the response headers.
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        # Best-effort fetch: callers treat "" as "no page". Only request
        # failures are swallowed; KeyboardInterrupt and programming errors
        # still propagate.
        return ""
def parsePage(ilt, html):
    """Extract price/title pairs from a search-result page into ``ilt``.

    The page is scanned for ``"view_price":"..."`` and ``"raw_title":"..."``
    fields; the n-th price is paired with the n-th title and appended to
    ``ilt`` as ``[price, title]`` (both strings). If the two fields occur a
    different number of times, the surplus entries are ignored.

    Args:
        ilt: List that receives the ``[price, title]`` entries (mutated in place).
        html: Page source to scan. Empty or ``None`` input adds nothing.
    """
    if not html:
        return
    # Capture groups pull out the quoted value directly, so titles that
    # contain ':' are no longer truncated and nothing from the remote page
    # is ever passed to eval().
    prices = re.findall(r'"view_price":"([\d.]*)"', html)
    titles = re.findall(r'"raw_title":"(.*?)"', html)
    for price, raw_title in zip(prices, titles):
        try:
            # Decode backslash escapes (e.g. \uXXXX) the way a JSON string
            # literal would be decoded.
            title = json.loads('"' + raw_title + '"')
        except ValueError:
            # Not a valid JSON string body; keep the text as it appeared.
            title = raw_title
        ilt.append([price, title])
def printGoodsList(ilt):
    """Print the collected goods as a tab-separated table on stdout.

    A header row is printed first, then one row per entry of ``ilt`` holding
    a 1-based running number, ``entry[0]`` (price) and ``entry[1]`` (title).
    A blank line closes the table.

    Args:
        ilt: Sequence of ``[price, title]`` entries.
    """
    row_format = "{:4}\t{:8}\t{:16}"
    print(row_format.format("序号", "价格", "商品名称"))
    for position, entry in enumerate(ilt, start=1):
        print(row_format.format(position, entry[0], entry[1]))
    print("")
def main():
    """Crawl the first search-result pages for a keyword and print the goods.

    Builds one search URL per page, downloads it, collects the price/title
    pairs found in it, and finally prints everything as a table. A page that
    fails is skipped so the remaining pages are still processed.
    """
    goods = "书包"
    depth = 2  # number of result pages to fetch
    start_url = "https://s.taobao.com/search?q=" + goods
    infoList = []
    for i in range(depth):
        # The "s" query parameter advances in steps of 44 per page --
        # presumably the number of results per page; TODO confirm.
        url = start_url + "&s=" + str(44 * i)
        try:
            html = getHTMLText(url)
            parsePage(infoList, html)
        except Exception:
            # Best-effort crawl: skip a page that fails, but let
            # KeyboardInterrupt / SystemExit stop the program.
            continue
    printGoodsList(infoList)


# Run the crawl only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
爬取淘宝商品信息一般分为以下步骤:

1. 打开淘宝搜索页面,输入关键词,获取搜索结果页面源代码。
2. 解析搜索结果页面,获取每个商品的链接地址。
3. 访问每个商品链接地址,获取商品详情页面源代码。
4. 解析商品详情页面,获取商品的基本信息和价格信息等。
5. 保存商品信息至本地文件或数据库。

下面是一个基于Python爬取淘宝商品信息的代码示例:

```python
import requests
from bs4 import BeautifulSoup

keyword = '手机'  # 搜索关键词
url = f'https://s.taobao.com/search?q={keyword}'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}

# 发送请求
response = requests.get(url, headers=headers)

# 解析搜索结果页面
soup = BeautifulSoup(response.text, 'html.parser')
items = soup.find_all('div', class_='item J_MouserOnverReq ')

for item in items:
    # 获取每个商品的链接地址
    link = 'https:' + item.find('a', class_='J_ClickStat')['href']

    # 访问商品链接地址
    details_response = requests.get(link, headers=headers)

    # 解析商品详情页面
    details_soup = BeautifulSoup(details_response.text, 'html.parser')

    # 获取商品的基本信息和价格信息等
    title = details_soup.find('h3', class_='tb-main-title').text.strip()
    price = details_soup.find('span', class_='tb-rmb-num').text
    print(title, price)
```

需要注意的是,淘宝对爬虫有一定的反爬虫机制,为了避免被封IP,可以在发送请求时添加一些随机的等待时间,或者使用代理IP。另外,爬取淘宝商品信息也需要遵守相关法律法规,不得侵犯商家和消费者的权益。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值