Eleven Popular Python Web-Scraping Projects with Full Source Code (You're Welcome)

This article collects several hands-on Python web-scraping projects, including grabbing cast photos from Douban Movies, saving Douyu danmu (bullet-chat) data, downloading Ximalaya audio, scraping all job listings from Shixiseng, and batch-saving images from Huaban. Along the way it works through tools such as XPath, Selenium, and BeautifulSoup, as well as storing the scraped data in CSV files and MongoDB.

Contents

Python scraper: download cast photos for The Fate of the Furious from Douban Movies

Python scraper: save Douyu danmu (bullet-chat) data to MongoDB

Python scraper: download audio from Ximalaya FM

Python scraper: capture and analyze traffic to scrape all job listings from Shixiseng

Python scraper: batch-download high-resolution images from Huaban

Python scraper: scrape V2EX data and save it to CSV

Python scraper: speed comparison of three ways to scrape the Wandoujia Design Award pages

Python scraper: parse HTML with lxml and print the extracted values

Python scraper: use Selenium to scrape dynamic data from Yidian Zixun

Python scraper: Selenium + XPath + bs4 to scrape Amazon data into MongoDB

Python scraper: fetch the Heilongjiang University (黑大) CAPTCHA and log in


Python scraper: download cast photos for The Fate of the Furious from Douban Movies

import os
import re
import urllib.request


def douban(url):
    # Fetch the cast page and decode it
    r = urllib.request.urlopen(url)
    html = r.read().decode('utf-8')
    # Medium-size photo URLs for each cast member (dots escaped, non-greedy match)
    result = re.findall(r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
    # Names come from the title="..." attributes on the same page
    result2 = re.findall(r'(?<=title=").\S+', html)
    result2.pop()  # drop a trailing non-name match
    # De-duplicate while preserving page order
    result3 = sorted(set(result2), key=result2.index)
    result3.pop(-3)  # drop another non-name entry specific to this page's layout
    if not os.path.exists('douban'):
        os.makedirs('douban')
    for i, link in enumerate(result):
        # Pair each photo URL with the name at the same index
        filename = os.path.join('douban', str(result3[i]) + '.jpg')
        urllib.request.urlretrieve(link, filename)


url = 'https://movie.douban.com/subject/26260853/celebrities'
if __name__ == '__main__':
    douban(url)
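
Pairing photos with names by lining up two separate regex lists is brittle: one extra title attribute anywhere on the page shifts the indexes out of sync, which is why the two pop() hacks above are needed. Below is a minimal sketch of index-free extraction with lxml and XPath; the li/a/img selectors are assumptions about Douban's current markup (on the live page the photo may be a CSS background image instead), so verify them before relying on this.

# Sketch: pair name and photo inside one node instead of by list index.
# The selectors below are assumptions about Douban's markup, not verified.
import urllib.request
from lxml import etree


def douban_xpath(url):
    html = urllib.request.urlopen(url).read().decode('utf-8')
    tree = etree.HTML(html)
    for li in tree.xpath('//li[contains(@class, "celebrity")]'):
        names = li.xpath('.//a/@title')
        photos = li.xpath('.//img/@src')
        if names and photos:
            print(names[0], photos[0])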

Python scraper: save Douyu danmu (bullet-chat) data to MongoDB

# Grab danmu from a Douyu live room and save each user's uid, nickname, level, and message text to MongoDB
__author__ = '布咯咯_rieuse'
__time__ = '2017.6.2'
__github__ = 'https://github.com/rieuse'

import multiprocessing
import re
import socket
import time

import pymongo
import requests
from bs4 import BeautifulSoup

clients = pymongo.MongoClient('localhost')
db = clients["DouyuTV_danmu"]
col = db["info"]

client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host = socket.gethostbyname("openbarrage.douyutv.com")
port = 8601
client.connect((host, port))

# Field patterns for Douyu's key@=value/ message encoding
danmu_path = re.compile(b'txt@=(.+?)/cid@')
uid_path = re.compile(b'uid@=(.+?)/nn@')
nickname_path = re.compile(b'nn@=(.+?)/txt@')
level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')


def sendmsg(msgstr):
    # Douyu's barrage protocol frames each message as: 4-byte little-endian
    # total length (written twice), a 4-byte message-type code (689 for
    # client-to-server), then the UTF-8 payload.
    msg = msgstr.encode('utf-8')
    data_length = len(msg) + 8
    code = 689
    msgHead = int.to_bytes(data_length, 4, 'little') \
              + int.to_bytes(data_length, 4, 'little') + int.to_bytes(code, 4, 'little')
    client.send(msgHead)
    sent = 0
    while sent < len(msg):
        tn = client.send(msg[sent:])
        sent = sent + tn


def start(roomid):
    # Log in to the barrage server, then join the room's danmu group
    msg = 'type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid)
    sendmsg(msg)
    msg_more = 'type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid)
    sendmsg(msg_more)

    print('--------------- Connected to the live room of {} ---------------'.format(get_name(roomid)))
    while True:
        data = client.recv(1024)
        if not data:
            break
        uid_more = uid_path.findall(data)
        nickname_more = nickname_path.findall(data)
        level_more = level_path.findall(data)
        danmu_more = danmu_path.findall(data)
        if not level_more:
            level_more = [b'0']  # default level when the field is absent
        for i in range(len(danmu_more)):
            try:
                product = {
                    'uid': uid_more[i].decode('utf-8'),
                    'nickname': nickname_more[i].decode('utf-8'),
                    'level': level_more[i].decode('utf-8') if i < len(level_more) else '0',
                    'danmu': danmu_more[i].decode('utf-8')
                }
                print(product)
                col.insert_one(product)  # pymongo 3+: insert() is deprecated
                print('Saved to MongoDB')
            except Exception as e:
                print(e)


def keeplive():
    # Heartbeat: the server drops silent connections, so ping every 15 seconds
    while True:
        msg = 'type@=keeplive/tick@=' + str(int(time.time())) + '/\0'
        sendmsg(msg)
        time.sleep(15)


def get_name(roomid):
    # Look up the streamer's display name from the room page
    r = requests.get("http://www.douyu.com/" + roomid)
    soup = BeautifulSoup(r.text, 'lxml')
    return soup.find('a', {'class': 'zb-name'}).string


if __name__ == '__main__':
    room_id = input('Enter the room ID: ')
    p1 = multiprocessing.Process(target=start, args=(room_id,))
    p2 = multiprocessing.Process(target=keeplive)
    p1.start()
    p2.start()
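
One caveat: client.recv(1024) returns raw chunks, so a single call can contain half a packet or several packets glued together, and the regex scan may occasionally split or miss a message. Below is a minimal sketch of length-prefixed framing on the receive side, mirroring the header layout that sendmsg writes; this helper is an illustrative addition, not part of the original script.

def recv_packet(sock):
    # Illustrative addition: read one framed Douyu message. The first 4 bytes
    # give the remaining length (little-endian); the body then repeats the
    # length (4 bytes), carries a type code (4 bytes), and ends with the payload.
    head = b''
    while len(head) < 4:
        chunk = sock.recv(4 - len(head))
        if not chunk:
            return b''
        head += chunk
    length = int.from_bytes(head, 'little')
    body = b''
    while len(body) < length:
        chunk = sock.recv(length - len(body))
        if not chunk:
            break
        body += chunk
    # Skip the repeated length field and the type code
    return body[8:]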

Python scraper: download audio from Ximalaya FM

__author__ = '布咯咯_rieuse'

import json
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree

clients = pymongo.MongoClient('localhost')
db = clients["XiMaLaYa"]
col1 = db["album2"]
col2 = db["detaile2"]

UA_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, lik