包含秒杀进度、距离结束时间、当前时间、商品标题、翻译后的标题、品牌、品牌是否有相关的备案注册信息、ASIN、Date first listed on Amazon、star、review、rank、price
删除了较多注释, 复制后能不用随缘。
import csv
import json
import time
import redis
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from googletrans import Translator
# import requests.packages.urllib3.util.ssl_
# requests.packages.urllib3.util.ssl_.DEFAULT_CIPHERS = 'ALL'
def trademark(goods_brand):
    """Report whether a brand looks like a registered US trademark.

    Results are cached in a local Redis instance (127.0.0.1:6379, db 0)
    keyed by the brand string. A cached 'Registered' is returned as is; any
    other cached value (including a previous 'unregistered') triggers a fresh
    lookup through headless Chrome against http://tmsearch.uspto.gov.

    :param goods_brand: brand name, or None / the string 'null' when unknown.
    :return: '未知' when the brand is unknown, otherwise 'Registered' or
        'unregistered'.
    """
    if goods_brand is None or goods_brand == 'null':
        return '未知'

    # NOTE(review): XXXXX is a redacted placeholder for the Redis password; it
    # must be defined elsewhere before this function can get past this line.
    r = redis.Redis(host='127.0.0.1', port=6379, db=0, decode_responses=True, password=XXXXX)
    redis_brand = r.get(goods_brand)
    if redis_brand == 'Registered':
        return redis_brand

    ff_option = Options()
    ff_option.add_argument('-headless')
    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences.
    browser = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe', options=ff_option)
    try:
        browser.get('http://tmsearch.uspto.gov')
        browser.find_element_by_xpath('/html/body/center/table[1]/tbody/tr[2]/td/font/font/a').click()
        # Type the brand to look up into the search box.
        browser.find_element_by_name("p_s_PARA2").send_keys(goods_brand)
        # Click the query button. The attribute name must be plain ASCII
        # "onclick"; a look-alike non-ASCII first letter matches nothing.
        browser.find_element_by_xpath("//input[@onclick='changeCurlyQuote();']").click()
        register_html = browser.page_source
    finally:
        # Always release the Chrome process, even when a lookup step fails.
        browser.quit()

    # Heuristic: a result page longer than 920 characters is treated as
    # "brand found"; anything shorter is treated as "not registered".
    brand_register = 'Registered' if len(register_html) > 920 else 'unregistered'
    r.set(goods_brand, brand_register)
    return brand_register
def send_request(url, headers, proxies, session, max_attempts=None):
    """GET ``url`` through ``session``, retrying whenever the request raises.

    Certificate verification is disabled (``verify=False``).

    :param url: address to fetch.
    :param headers: request headers passed to ``session.get``.
    :param proxies: proxies mapping passed to ``session.get``.
    :param session: object exposing a requests-style ``get`` method.
    :param max_attempts: optional cap on the number of attempts. ``None``
        (the default) retries forever, as before.
    :return: whatever ``session.get`` returns on the first successful call.
    :raises Exception: the last error, once ``max_attempts`` is exhausted.
    """
    attempts = 0
    while True:
        attempts += 1
        try:
            return session.get(url, headers=headers, proxies=proxies, verify=False)
        except Exception as E:
            print(E)
            if max_attempts is not None and attempts >= max_attempts:
                raise
            print('失败,正在重新尝试。')
# Inline stylesheet text that leaks into the rank element's text content.
_RANK_INLINE_CSS = (
    ".zg_hrsr { margin: 0; padding: 0; list-style-type: none; }"
    ".zg_hrsr_item { margin: 0 0 0 10px; }"
    ".zg_hrsr_rank { display: inline-block; width: 80px; text-align: right; }"
)

# Label variants removed from the rank text, in this order.
_RANK_LABELS = ("Amazon Best Sellers Rank:", "Amazon Bestsellers Rank: ", "Best Sellers Rank")


def _clean_rank_text(raw_text):
    """Strip newlines, inline CSS and labels; put each '#' entry on its own line."""
    cleaned = raw_text.replace("\n", '').replace(_RANK_INLINE_CSS, "")
    for label in _RANK_LABELS:
        cleaned = cleaned.replace(label, '')
    return cleaned.strip().replace("#", '\n#')


def rank(goods_soup):
    """Extract the best-sellers rank text from a product detail page.

    Looks first for ``<li id="SalesRank">``; otherwise scans the rows of
    ``<table id="productDetails_detailBullets_sections1">`` for the one whose
    header reads "Best Sellers Rank" (spaces ignored).

    :param goods_soup: parsed product page (BeautifulSoup-like object).
    :return: cleaned rank text with every '#' preceded by a newline, or
        ``None`` when neither layout contains a rank.
    """
    goods_rank_li = goods_soup.find('li', id='SalesRank')
    if goods_rank_li:
        return _clean_rank_text(goods_rank_li.text.strip())

    goods_rank_table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if not goods_rank_table:
        return None

    for tr in goods_rank_table.find_all('tr'):
        header_cell = tr.find('th')
        value_cell = tr.find('td')
        # Rows without a header or value cell cannot be the rank row.
        if header_cell is None or value_cell is None:
            continue
        if header_cell.text.strip().replace(" ", '') == 'BestSellersRank':
            return _clean_rank_text(value_cell.text)
    return None
def title(goods_soup):
    """Return the product title from ``<span id="productTitle">``.

    The text is stripped, and non-breaking spaces and commas are removed.

    :param goods_soup: parsed product page (BeautifulSoup-like object).
    :return: cleaned title, or the string 'null' when the span is absent.
    """
    title_node = goods_soup.find('span', id='productTitle')
    if not title_node:
        return 'null'
    cleaned = title_node.text.strip()
    for unwanted in ("\xa0", ","):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned
def brand(goods_soup):
    """Return the brand name shown on a product detail page.

    Tries ``<a id="bylineInfo">`` first and ``<a id="brand">`` second; the
    text of the first one found is stripped and has non-breaking spaces
    removed.

    :param goods_soup: parsed product page (BeautifulSoup-like object).
    :return: brand text, or the string 'null' when neither anchor exists.
    """
    for anchor_id in ('bylineInfo', 'brand'):
        anchor = goods_soup.find('a', id=anchor_id)
        if anchor:
            return anchor.text.strip().replace("\xa0", '')
    return 'null'
def star(goods_soup):
    """Return the star rating shown on a product detail page.

    Reads the ``<i>`` element inside ``<span id="acrPopover">`` and keeps the
    part of its text before " out of 5 stars".

    :param goods_soup: parsed product page (BeautifulSoup-like object).
    :return: rating text, or the string 'null' when the span or its ``<i>``
        child is missing.
    """
    goods_star_span = goods_soup.find('span', id='acrPopover')
    if not goods_star_span:
        return 'null'
    star_icon = goods_star_span.find('i')
    # The popover can exist without the icon; report "no rating" in that case.
    if star_icon is None:
        return 'null'
    return star_icon.text.split(" out of 5 stars")[0]
def review(goods_soup):
    """Return the review count shown on a product detail page.

    Reads ``<span id="acrCustomerReviewText">`` and keeps the part of its
    text before " customer reviews".

    :param goods_soup: parsed product page (BeautifulSoup-like object).
    :return: review-count text, or the string 'null' when the span is absent.
    """
    review_node = goods_soup.find('span', id='acrCustomerReviewText')
    if not review_node:
        return 'null'
    count_text, _, _ = review_node.text.partition(" customer reviews")
    return count_text
def price(goods_soup):
    """Return the product price with the dollar sign removed.

    Checks these ``<span>`` ids in priority order and uses the first one
    present: deal price, buy-box price, sale price, regular price.

    :param goods_soup: parsed product page (BeautifulSoup-like object).
    :return: price text without "$", or the string 'null' when none of the
        spans exists.
    """
    candidate_ids = (
        'priceblock_dealprice',
        'newBuyBoxPrice',
        'priceblock_saleprice',
        'priceblock_ourprice',
    )
    for span_id in candidate_ids:
        price_node = goods_soup.find('span', id=span_id)
        if price_node:
            return price_node.text.replace("$", '')
    return 'null'
def date(goods_soup):
    """Return the "Date first listed on Amazon" value from a product page.

    Two page layouts are tried in order:

    1. ``<div id="detailBullets_feature_div">``: the ``<li>`` whose bold span
       reads "Date first listed on Amazon:".
    2. ``<table id="productDetails_detailBullets_sections1">``: the row whose
       header reads "Date first listed on Amazon" (spaces ignored).

    The table is consulted whenever the bullet list does not yield a date,
    including when the bullet div exists but has no such entry.

    :param goods_soup: parsed product page (BeautifulSoup-like object).
    :return: the date text, or ``None`` when neither layout contains it.
    """
    goods_date_div = goods_soup.find('div', id='detailBullets_feature_div')
    if goods_date_div:
        for li in goods_date_div.find_all('li'):
            li_title_span = li.find('span', class_='a-text-bold')
            if li_title_span and li_title_span.text.strip() == 'Date first listed on Amazon:':
                return li.text.strip().replace("Date first listed on Amazon:", '').strip()

    goods_date_table = goods_soup.find('table', id='productDetails_detailBullets_sections1')
    if goods_date_table:
        for tr in goods_date_table.find_all('tr'):
            header_cell = tr.find('th')
            value_cell = tr.find('td')
            # Rows without a header or value cell cannot hold the date.
            if header_cell is None or value_cell is None:
                continue
            if header_cell.text.strip().replace(" ", '') == 'DatefirstlistedonAmazon':
                return value_cell.text.replace("\n", '').strip()
    return None
def translator(title, src='de', dest="zh-CN"):
    """Translate a product title with googletrans via translate.google.cn.

    :param title: text to translate; it is converted with ``str()`` first.
    :param src: source language code. Defaults to 'de', the value this
        function has always used.
        NOTE(review): the caller scrapes amazon.com, so the titles are
        presumably English - confirm whether 'de' is intended.
    :param dest: target language code. Defaults to 'zh-CN'.
    :return: the translated text.
    """
    client = Translator(service_urls=['translate.google.cn'])
    result = client.translate(str(title), src=src, dest=dest)
    # Read the translation from the result object instead of slicing its
    # string representation, which breaks whenever a pronunciation is present
    # or the title itself contains "text=".
    return result.text
def _load_deal_cards(page):
    """Render one Lightning Deals listing page in headless Chrome and return its deal cards."""
    ff_option = Options()
    ff_option.add_argument('-headless')
    # Raw string: the Windows path contains backslashes that are not valid
    # escape sequences.
    browser = webdriver.Chrome(r'C:\Program Files (x86)\Google\Chrome\Application\chromedriver.exe', options=ff_option)
    try:
        browser.get('https://www.amazon.com/gp/goldbox/ref=gbps_ftr_s-4_d724_page_' + str(
            page) + '?gb_f_deals1=dealStates:AVAILABLE%252CWAITLIST%252CWAITLISTFULL%252CEXPIRED%252CSOLDOUT%252CUPCOMING,page:' + str(page) + ',dealTypes:LIGHTNING_DEAL,dealsPerPage:48')
        # Fixed wait so the page's scripts can fill in the deal grid.
        time.sleep(10)
        pageSource = browser.page_source
    finally:
        # Release the Chrome process for this page even if loading failed.
        browser.quit()

    page_soup = BeautifulSoup(pageSource, 'lxml')
    widget_content = page_soup.find('div', id='widgetContent')
    if widget_content is None:
        # No deal grid on the page: nothing to scrape.
        return []
    return widget_content.find_all('div', class_='a-section dealContainer')


def run(page):
    """Scrape one Lightning Deals listing page and append its rows to a CSV.

    For every deal card on the page the product detail page is fetched and
    one row is appended to
    ``./Lightning_Deals_US/Lightning_Deals_US_<page>.csv`` (gb18030 encoded)
    with these columns, in order: claimed progress, time remaining, current
    local time, title, translated title, brand, trademark status, ASIN,
    date first listed, star rating, review count, best-sellers rank, price.

    Deal cards that have no product link containing "dp/" are skipped.

    :param page: listing page number, used in the URL and the CSV file name.
    """
    import os

    print("当前页码为:%s" % page)
    base_url = 'https://www.amazon.com/dp/'
    headers = {
        'accept-encoding': 'gzip, deflate, br',
        'upgrade-insecure-requests': '1',
        'user-agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36",
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'cache-control': 'max-age=0',
        'authority': 'www.amazon.com',
    }
    csv_path = './Lightning_Deals_US/Lightning_Deals_US_%s.csv' % str(page)

    all_goods_div = _load_deal_cards(page)
    print(len(all_goods_div))

    for goods_div in all_goods_div:
        # Claimed-progress text of the deal.
        schedule = 'null'
        schedule_div = goods_div.find('div', 'a-column a-span5 a-text-left unitLineHeight')
        if schedule_div:
            schedule_row = schedule_div.find('div', 'a-row unitLineHeight')
            if schedule_row is not None:
                schedule = schedule_row.text.strip().replace(" Claimed", '').replace("\xa0", '').replace("\xae", '').replace("\u2122", '')

        # Time left until the deal ends.
        timer = goods_div.find('span', role='timer')
        if timer:
            end_time = timer.text.strip().replace("\xa0", '').replace("\xae", '').replace("\u2122", '')
        else:
            end_time = 'null'
        now_time = time.strftime('%H:%M:%S', time.localtime(time.time()))

        # Derive the ASIN from the deal's product link; skip cards without one.
        dealtitle = goods_div.find('a', id='dealImage')
        href = dealtitle.get('href') if dealtitle is not None else None
        if not href:
            continue
        href_parts = href.split("dp/")
        if len(href_parts) < 2:
            continue
        goods_asin = href_parts[1].split('/')[0]

        goods_url = base_url + goods_asin
        goods_html = requests.get(goods_url, headers=headers)
        goods_soup = BeautifulSoup(goods_html.text, 'lxml')
        print("商品链接为:" + goods_url)

        goods_title = title(goods_soup)
        after_title = translator(goods_title)
        goods_brand = brand(goods_soup)
        goods_star = star(goods_soup)
        goods_review = review(goods_soup)
        goods_price = price(goods_soup)
        goods_rank = rank(goods_soup)
        goods_date = date(goods_soup)
        brand_register = trademark(goods_brand)
        if goods_date is None:
            goods_date = 'null'

        print("schedule:" + schedule)
        print("goods_title:" + goods_title)
        print("after_title:" + after_title)
        print("goods_asin:" + goods_asin)
        print("goods_brand:" + goods_brand)
        print("brand_register:" + brand_register)
        print("goods_date:" + str(goods_date))
        print("goods_star:" + goods_star)
        print("end_time:" + end_time)
        print("now_time:" + now_time)
        print("goods_review:" + goods_review)
        print("goods_rank:" + str(goods_rank))
        print("goods_price:" + goods_price)

        goods_info_list = [
            schedule,
            end_time,
            now_time,
            goods_title,
            after_title,
            goods_brand,
            brand_register,
            goods_asin,
            goods_date,
            goods_star,
            goods_review,
            goods_rank,
            goods_price,
        ]
        print('=========================================')

        # Append row by row so rows already scraped survive a later failure.
        os.makedirs(os.path.dirname(csv_path), exist_ok=True)
        # newline='' prevents the csv module from emitting blank lines between rows.
        with open(csv_path, 'a', newline='', encoding='gb18030') as csvFile:
            csv.writer(csvFile).writerow(goods_info_list)
if __name__ == '__main__':
    # Scrape listing pages 1 through 21, one page (and one CSV file) at a time.
    first_page, last_page = 1, 21
    for page_number in range(first_page, last_page + 1):
        run(page_number)