Scraping shop product ids
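In brief: the script below reads a list of competitor Tmall shops from a MySQL table, pages through each shop's on-site search results, pulls the numeric product id out of each listing's data-id attribute, and inserts any ids not yet recorded into that shop's own table.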

import requests
from bs4 import BeautifulSoup
import re
import time
import random
import pymysql.cursors
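# MySQL connection; DictCursor returns each row as a dict keyed by column name.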
connection = pymysql.connect(host='localhost',
                             user='root',
                             password='123',
                             db='asd',
                             charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
# Request headers copied from a logged-in browser session (the Cookie is
# session-specific and will expire). These must be sent as headers, not as
# query-string params.
headers = {
    "Accept-Encoding": "gzip, deflate, sdch, br",
    "Accept-Language": "zh-CN,zh;q=0.8",
    "Connection": "keep-alive",
    "Cookie": "hng=; uss=UIMY14A%2B04Bbq%2BqRxS6C9OzJWudsw14Q1kb5mDDqxW%2BQ3YG%2BUcpgrDRWnRQ%3D; uc3=sg2=AC4AfXCJ7XkLw0gCUD1tD9ZxhXFdweN2A6VfybWadxI%3D&nk2=&id2=&lg2=; t=3c0787f77a28e0854ef28fc360b2c555; cookie2=1c912d33e44bdb2008763748702a61f4; _tb_token_=78577371d8136; l=AiQkmjyCyPnG7qTN1Iu5fBqvdCgWvUgn; isg=AvDwL_qYXdDeegACSXGXiIOKwb7f2NSDXgsSOepBvMsepZFPkkmkE0aNixo_; pnm_cku822=; cna=T7gREcWMLDsCAavWmjBJPJpS; Hm_lvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950; Hm_lpvt_c478afee593a872fd45cb9a0d7a9da3b=1495496950",
    "Host": "tanggulake.tmall.com",
    "Referer": "https://tanggulake.tmall.com/search.htm?spm=a220o.1000855.w5002-15900729481.1.b3kpys&search=y",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36",
    "X-Requested-With": "XMLHttpRequest"}

with connection.cursor() as cursor:
    # Fetch the list of competitor shops to crawl (a SELECT needs no commit).
    sql = "select * from 竞店"
    cursor.execute(sql)
    shops = cursor.fetchall()



for shop in shops:
    base_url = shop["地址"]
    # Fetch page 1 first to read the total page count from the "current/total" pager text.
    first_page = requests.get(base_url + "1", headers=headers)
    soup = BeautifulSoup(first_page.text, "lxml")
    pager = soup.select("div > div > div > div > span:nth-of-type(1)")
    page_count = pager[2].text.split("/")[1]  # e.g. "1/5" -> "5"
    print(page_count)
    ids = []
    for page in range(1, int(page_count) + 1):
        time.sleep(random.randrange(1, 5))  # throttle requests to avoid being blocked
        page_url = base_url + str(page)
        resp = requests.get(page_url, headers=headers)
        soup = BeautifulSoup(resp.text, "lxml")
        items = soup.select("div > div > div > dl")
        for item in items:
            # Each product <dl> carries its id in the data-id attribute; keep digits only.
            ids.append(re.sub(r"\D", "", item.get("data-id")))

    with connection.cursor() as cursor:
        # Table names cannot be parameterized, so the shop name is interpolated directly.
        sql = "select id from " + shop["店铺名称"]
        cursor.execute(sql)
        known = [row["id"] for row in cursor.fetchall()]
        # Insert only the ids not already recorded for this shop.
        for product_id in ids:
            if product_id not in known:
                sql = "INSERT INTO " + shop["店铺名称"] + " (`id`) VALUES (%s)"
                cursor.execute(sql, (product_id,))
    # pymysql connections are not autocommit by default, so commit the inserts.
    connection.commit()
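The original post never shows the table definitions, but from the queries above the script evidently expects a 竞店 index table with 店铺名称 and 地址 columns, plus one id table per shop named after its 店铺名称 value. A minimal sketch of that assumed schema (示例店铺 is a hypothetical placeholder shop name):

import pymysql

conn = pymysql.connect(host='localhost', user='root', password='123',
                       db='asd', charset='utf8mb4')
with conn.cursor() as cursor:
    # Index of competitor shops: one row per shop to crawl.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS 竞店 (
            店铺名称 VARCHAR(64)  NOT NULL,  -- shop name; also used as the per-shop table name
            地址     VARCHAR(255) NOT NULL   -- search URL prefix; the crawler appends the page number
        )""")
    # One table per shop, holding that shop's scraped product ids.
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS 示例店铺 (
            id VARCHAR(32) PRIMARY KEY       -- numeric product id, stored as text
        )""")
conn.commit()

With a primary key on id, the per-id existence check in the main loop could also be collapsed into a single INSERT IGNORE per id.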

 

Reposted from: https://www.cnblogs.com/gao-xiang/p/6949794.html
