Python获取个人网站的所有课程下载链接和密码,并保存到Mongodb中

1、获取网站课程的分类地址;

'''
爬取屌丝首页,获取每个分类名称和链接
'''

import requests
from lxml import etree

# Browser-like User-Agent so the site serves the crawler normal pages.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}
def get_class_data():
    """Scrape the site homepage and return the course categories.

    Returns a list of dicts, each with keys '类别名称' (category name)
    and '类别链接' (category link).  Only directory-style links — those
    whose URL ends with '/' — are kept, matching the original filter
    ``len(link.split('/')[-1]) == 0``.
    """
    url = 'http://www.diaosiweb.net/index.html'
    response = requests.get(url, headers=headers)
    # Let requests guess the page encoding (the site is GBK-encoded).
    response.encoding = response.apparent_encoding
    # Parse the page once and reuse the tree (the original re-parsed the
    # same HTML for every xpath call).
    tree = etree.HTML(response.text)
    class_names = tree.xpath('//div[@id="menu"]/div/ul/li/a/text()')
    class_links = tree.xpath('//div[@id="menu"]/div/ul/li/a/@href')
    return [
        {'类别名称': class_name, '类别链接': class_link}
        for class_name, class_link in zip(class_names, class_links)
        if not class_link.split('/')[-1]  # keep only links ending in '/'
    ]
View Code

2、通过上面获取的地址来获取所有的每个分类下的所有课程名称、链接和发布时间,并保存到Mongodb中去;

'''
获取每个分类url下面的课程名称和链接,然后通过课程链接,进入到链接里面去获取每个课程的url和密码
'''

from spiders_diaosi import get_class_data
import requests
from lxml import etree
import pymongo
from multiprocessing import Pool

# Browser-like User-Agent for every request made by this script.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}
# Local MongoDB: database 'kecheng_message', collection 'message' stores
# one document per scraped course (name, link, publish time).
client = pymongo.MongoClient('localhost',27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']

def get_kecheng_data(url):
    """Fetch one listing page and store every course's name, link and
    publish time into the ``kecheng_message`` MongoDB collection.

    Errors are caught and printed so one bad page does not abort the
    whole crawl.
    """
    try:
        response = requests.get(url, headers=headers)
        response.encoding = response.apparent_encoding
        # Parse the page once instead of re-parsing it for every xpath.
        tree = etree.HTML(response.text)
        kecheng_names = tree.xpath('//ul[@class="g-list1"]/li/a/text()')
        kecheng_links = tree.xpath('//ul[@class="g-list1"]/li/a/@href')
        # 'pub_time' instead of 'time' to avoid shadowing the stdlib module.
        pub_times = tree.xpath('//ul[@class="g-list1"]/li/span/text()')
        for kecheng_name, kecheng_link, pub_time in zip(kecheng_names, kecheng_links, pub_times):
            data = {
                '课程名称': kecheng_name,
                '课程链接': kecheng_link,
                '发布时间': pub_time,
            }
            # insert_one() replaces the deprecated/removed Collection.insert().
            kecheng_message.insert_one(data)
    except Exception as e:
        # Best-effort crawl: report the failure and continue.
        print(e)

def get_max_page(url):
    """Return the total number of pages shown in the category's pager."""
    page_response = requests.get(url, headers=headers)
    counts = etree.HTML(page_response.text).xpath(
        '//span[@class="pageinfo"]/strong[1]/text()')
    return int(counts[0])

def get_class_id(url):
    """Crawl every listing page of one category.

    Multi-page categories use pager URLs of the form
    ``list_<class_id>_<page>.html``; the class id is taken from the last
    pager link.  Single-page categories are crawled directly.
    """
    class_response = requests.get(url, headers=headers)
    class_response.encoding = class_response.apparent_encoding
    # Fetch the page count once — the original called get_max_page() twice,
    # costing an extra HTTP request for every category.
    max_page = get_max_page(url)
    if max_page != 1:
        class_id = int(
            etree.HTML(class_response.text)
            .xpath('//ul[@class="pagelist"]/li/a/@href')[-1]
            .split('_')[1]
        )
        for num in range(1, max_page + 1):
            new_url = '{}list_{}_{}.html'.format(url, class_id, num)
            get_kecheng_data(new_url)
    else:
        get_kecheng_data(url)

# Walk the previously scraped category links and crawl the courses under
# each one, reporting progress before and after every category.
for category in get_class_data():
    url = category['类别链接']
    print('开始爬取:' + category['类别名称'])
    get_class_id(url)
    print('已经爬完了:' + category['类别名称'])
View Code

3、从数据库中读取每个课程的链接。因为下载地址只有登入之后才可以看到,所以模拟登入之后再进行获取,并保存到Mongodb中去。

from get_captcha import get_capthca
import pymongo
import re
import requests
from lxml import etree
import random

client = pymongo.MongoClient('localhost',27017)
diaosi = client['kecheng_message']
kecheng_message = diaosi['message']      # course list written by the crawler
dow_message = diaosi['dow_message']      # download link + password per course

login_url = 'http://www.diaosiweb.net/member/index.php'
# A few desktop User-Agents; one is picked at random per run.
headers_data = [
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0',
]
headers = {'User-Agent':random.choice(headers_data)}
# DedeCMS member-login form fields.
data = {
   'fmdo':'login',
   'dopost':'login',
   'gourl':'',
   'userid':'***',      # put your username here (or read it with input())
   'pwd':'****',        # put your password here (or read it with input())
   'vdcode':'',         # captcha, filled in below
   'keeptime':'604800',
}

# Save the captcha image locally, then ask the user to type it in.
get_capthca(login_url)
captcha = input('输入你看到的验证码:')
data['vdcode'] = captcha

session = requests.Session()
session.headers.update(headers)

# BUG FIX: a login form must be submitted with POST.  The original used
# session.get(..., data=...), which never sends the credentials, so every
# later request in the session was unauthenticated.
login_response = session.post(login_url, data=data)

for link in kecheng_message.find():
    try:
        html = session.get(link['课程链接'])
        html.encoding = html.apparent_encoding
        # The download link sits in a hidden <div id='pan'>; extraction is
        # now inside the try so a page without it is logged, not fatal.
        dow_url = re.compile("<div id='pan' style=\"display:none;\">(.*?)</div>").findall(html.text)[0]
        mima = etree.HTML(html.text).xpath('//span[@style]/text()')
        # Separate name from the login 'data' dict, which the original
        # clobbered on the first loop iteration.
        course_doc = {
            'name':link['课程名称'],
            'link':link['课程链接'],
            'dow_url':dow_url,
        }
        # Precedence note: this reads as  A or (B and C)  — either no
        # candidate spans, or many spans none of which mention a password.
        if len(mima) == 0  or len(mima) > 5 and '网盘提取密码' not in mima[-1].split(':') :
            course_doc['mima'] = '没有密码'
        else:
            course_doc['mima'] = mima
        # insert_one() replaces the deprecated/removed Collection.insert().
        dow_message.insert_one(course_doc)
        print(course_doc)
    except Exception as e:
        # Best effort: report the failure and move on to the next course.
        print(e)
        print(link['课程名称'])
View Code

下面是获取网页验证码的代码:

'''
获取登入界面的验证码,并保存到本地     --现在只是保存到本地中,后期再编写自动输入
'''


import requests
from lxml import etree
import os

# Login page that embeds the captcha image (<img id="vdimgck">).
login_url = 'http://www.diaosiweb.net/member/index.php'
# Browser-like User-Agent so the site serves the normal login page.
headers = {
    'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.81 Safari/537.36',
}

def get_capthca(url):
    """Download the login page's captcha image to ./captcha.jpg.

    The image URL is read from the <img id="vdimgck"> element; its
    relative '..' prefix is stripped before joining with the site root.
    (Function name keeps the original 'capthca' spelling for callers.)
    """
    login_response = requests.get(url, headers=headers)
    image_path = etree.HTML(login_response.text).xpath('//img[@id="vdimgck"]/@src')[0]
    image_url = 'http://www.diaosiweb.net' + image_path.replace('..', '')
    # Send the same browser-like headers as every other request.
    image_response = requests.get(image_url, headers=headers).content
    # The with-statement closes the file; the explicit close() was redundant.
    with open('captcha.jpg', 'wb') as f:
        f.write(image_response)
    print('验证码已经保存到:{}'.format(os.getcwd()))
View Code

恩,这样差不多就完成了一个爬虫项目了,因为是第一次完整的爬取,所以写的比较乱,也没有思维图,也知道有很多地方不完善,但是发懒筋了,不想写了,先这样吧!

转载于:https://www.cnblogs.com/114811yayi/p/6938948.html

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值