爬取京东部分数据(Python 2.7 + BeautifulSoup),代码奉上

#_*_coding=utf-8 _*_
#__author__ = 'Administrator'


from selenium import webdriver
from bs4 import BeautifulSoup
import sys
import time
from compiler.ast import flatten
import requests
import re
import smtplib
from email.mime.text import MIMEText
from email.header import Header
import email.mime.multipart
from email.MIMEMultipart import MIMEMultipart
from email.MIMEBase import MIMEBase
from email import Encoders
from openpyxl import Workbook
# Python 2 idiom: re-import sys so that setdefaultencoding is callable, then
# make UTF-8 the interpreter-wide default encoding -- presumably so that the
# implicit str/unicode conversions of the Chinese text below do not fail.
# NOTE(review): neither reload() as a builtin nor sys.setdefaultencoding
# exists on Python 3; this file is Python 2 only.
reload(sys)
sys.setdefaultencoding('utf-8')
def getHTMLText(url, retries=5, delay=2, timeout=30):
    """Download *url* and return the decoded response body.

    The request is sent with a desktop-browser User-Agent.  When an attempt
    fails (connection error, timeout, or an HTTP error status) it is retried
    up to *retries* more times, sleeping *delay* seconds before each retry.

    Args:
        url: Address of the page to download.
        retries: Number of additional attempts after the first failure.
        delay: Seconds to sleep before each retry.
        timeout: Seconds to wait for the server before an attempt counts as
            failed; without it a stalled connection would block forever.

    Returns:
        The page text, decoded with the encoding detected from the content,
        or None when every attempt failed.
    """
    # Pretend to be a regular desktop browser.
    user_agent = ('Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/58.0.3029.110 Safari/537.36')
    headers = {'User-Agent': user_agent}

    for attempt in range(retries + 1):
        if attempt:
            # Only wait between attempts, never before the first one.
            time.sleep(delay)
        try:
            r = requests.get(url, headers=headers, timeout=timeout)
            r.raise_for_status()
            r.encoding = r.apparent_encoding
            return r.text
        except requests.RequestException:
            # Network or HTTP failure: fall through to the next attempt.
            continue
    return None
def _fetch_soup(url):
    """Download *url* and parse it with lxml; return None if the download failed."""
    html = getHTMLText(url)
    if html is None:
        return None
    return BeautifulSoup(html, 'lxml')


# --- Step 1: use a real browser to reach the category listing and read the
# --- total number of goods shown there.
deiver = webdriver.Firefox()
try:
    deiver.get('https://fresh.jd.com/')
    # The category link is addressed by absolute XPath; per the original
    # author's note the first div index of the trailing "div[..]/div/a" part
    # has to be adjusted by hand for the wanted category.
    deiver.find_element_by_xpath('/html/body/div[4]/div/div[1]/div[2]/div[1]/div[1]/div[2]/div[1]/div[3]/div/a').click()
    windows = deiver.window_handles
    # The click opens the goods list in a new window: close the home page
    # and continue in the newly opened window.
    deiver.switch_to.window(windows[0])
    deiver.close()
    deiver.switch_to.window(windows[-1])
    time.sleep(2)
    # Text of the counter element on the listing page (total number of goods).
    pages = deiver.find_element_by_xpath('/html/body/div[7]/div[1]/div[1]/div[1]/div/span').text
finally:
    # The browser is not needed after this point; do not leave it running.
    deiver.quit()

pages = int(pages)
# Each listing page holds 60 goods.  Round up so that a final, partially
# filled page is visited as well.
page = (pages + 59) // 60

# --- Step 2: walk every listing page and collect the product links.
# NOTE: the "cat=" value has to be edited by hand to match the category.
list_url_head = 'https://list.jd.com/list.html?cat=12218,13581&page='
list_url_tail = '&sort=sort_rank_asc&trans=1&JL=6_0_0#J_main'
# Captures the href of the product link (including its closing quote, which
# is stripped later when the product URL is built).
href_pattern = re.compile('">\n<a href="(.*) target=')

all_url_goods = []  # product-page URLs gathered from all listing pages
for aa in xrange(1, page + 1):
    soup = _fetch_soup('%s%d%s' % (list_url_head, aa, list_url_tail))
    # A failed download is skipped instead of re-parsing the previous page.
    if soup is not None:
        for tag in soup.select('div[class="p-name"]'):
            found = href_pattern.findall(str(tag))
            if found:
                all_url_goods.append(found[0].decode())
    print('第%s页'%aa)
print(all_url_goods)

# --- Step 3: open every product page and collect the SKU ids of all of its
# --- variants (elements carrying a data-sku attribute).
sku_pattern = re.compile('data-sku="(.*)" data-value="')

goods_url = []
for index, link in enumerate(all_url_goods):
    print('第%s个商品'%index)
    soup = _fetch_soup('%s%s' % ('http:', link.replace('"', '')))
    if soup is None:
        continue
    for tag in soup.findAll(attrs={'data-sku': True}):
        goods_url.extend(sku_pattern.findall(str(tag)))
print('结束goods_url')
print(goods_url)

# --- Step 4: open the page of every SKU and record its display name.
name = []
for index, sku in enumerate(goods_url):
    try:
        sku_id = int(sku)
    except ValueError:
        # The greedy SKU pattern can capture extra markup; skip anything
        # that is not a plain number instead of aborting the whole run.
        print('第%d个商品'%index)
        continue
    time.sleep(2)  # throttle requests to the item pages
    soup = _fetch_soup('https://item.jd.com/%d.html' % sku_id)
    titles = soup.select('div[class="sku-name"]') if soup is not None else []
    if titles:
        name.append(titles[0].text)
    print('第%d个商品'%index)

# `name` stays a flat list of strings: wd_aa() pairs the names itself, and
# pairing them here as well made every spreadsheet row appear twice.
def wd_aa(name):
    """Write the product names to ``roulei.xlsx``, one two-column row each.

    Args:
        name: A (possibly nested) list of product names.  It is flattened
            first and None entries are dropped.

    Every remaining name is written to sheet ``sheet1`` as a row holding the
    name twice, and the workbook is saved as ``roulei.xlsx`` in the current
    working directory.
    """
    names = [item for item in flatten(name) if item is not None]
    # NOTE(review): openpyxl's Workbook constructor does not appear to take a
    # file name; the string is probably interpreted as a truthy write-only
    # flag.  Left unchanged to keep the produced file identical -- confirm
    # against the installed openpyxl version.
    wb = Workbook('roulei.xlsx')
    ws = wb.create_sheet('sheet1')
    for item in names:
        ws.append((item, item))
    wb.save('roulei.xlsx')
# Dump the collected product names into the spreadsheet.
wd_aa(name)
# Parenthesised like every other print in this file; prints the same text.
print('结束')
def emaile(fileurl,filename):
    """Mail the file at *fileurl* to the configured mailbox as an attachment.

    Args:
        fileurl: Path of the file to attach; it is read in binary mode.
        filename: File name announced to the recipient in the
            Content-Disposition header.

    The message is sent through ``smtp.qq.com`` on port 25 from and to the
    address configured below.  Errors from opening the file or from smtplib
    propagate to the caller; the SMTP session is closed in either case.
    """
    # NOTE(review): account name and password are hard-coded in the source;
    # they should be moved to configuration or environment variables.
    sender = '809647468@qq.com'      # sender address
    user = '809647468'
    password = 'zlmjeoeivdqfbecj'
    smtpserver = 'smtp.qq.com'       # SMTP server address
    receiver = "809647468@qq.com"    # recipient address

    msg = MIMEMultipart()
    msg['from'] = sender
    msg['to'] = receiver
    msg['subject'] = Header('chubby superman salutes you', 'utf-8')

    # Plain-text part.  It has to be attached explicitly, otherwise the mail
    # is delivered with the attachment only.
    body = MIMEText('新近我寂寞浓烈如酒,夕阳下多姿的垂柳,亦让人不知其是长袖善舞还是骚在其骨。我紧挥衣袖,想不带走一片云彩,却还是带走了你的容颜。','plain','utf-8')
    msg.attach(body)

    # Attachment part, base64-encoded.
    part = MIMEBase('application', 'octet-stream')
    with open(fileurl, 'rb') as attachment:
        part.set_payload(attachment.read())
    Encoders.encode_base64(part)
    part.add_header('Content-Disposition', 'attachment; filename="%s"'%filename)
    msg.attach(part)

    smtp = smtplib.SMTP()
    smtp.connect(smtpserver, 25)
    try:
        smtp.login(user, password)
        smtp.sendmail(sender, receiver, msg.as_string())
    finally:
        # Close the session even when login or sending fails.
        smtp.quit()
# Raw string: in a plain literal the "\r" of "\roulei.xlsx" is a carriage
# return escape, which silently corrupts the Windows path.
emaile(r'E:\untitled1\wanwan\roulei.xlsx','roulei.xlsx')

望各位指教

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值