目录
Python爬虫:Selenium+xpath+bs4爬取亚马逊数据保存到mongodb
Python爬虫:爬取豆瓣电影中速度与激情8演员图片
import urllib.request
import os
import re
def douban(url):
    """Download cast photos from a douban movie-celebrities page.

    Scrapes the photo URLs and the matching actor names from the page
    HTML, then saves each photo as douban/<name>.jpg.
    """
    response = urllib.request.urlopen(url)
    html = response.read().decode('utf-8')
    # Escape the dots and make the match non-greedy: the original
    # r'...medium/.*.jpg' used unescaped dots and a greedy .* that could
    # swallow everything up to the last ".jpg" on the line.
    links = re.findall(
        r'https://img\d\.doubanio\.com/img/celebrity/medium/.*?\.jpg', html)
    names = re.findall(r'(?<=title=").\S+', html)
    names.pop()  # drop the trailing match that is not an actor name
    # De-duplicate while preserving first-seen order.
    ordered = sorted(set(names), key=names.index)
    ordered.pop(-3)  # remove a page element that is not an actor name
    os.makedirs('douban', exist_ok=True)
    # zip pairs each photo with its name and stops at the shorter list,
    # avoiding the IndexError the manual counter could hit.
    for name, link in zip(ordered, links):
        filename = os.path.join('douban', str(name) + '.jpg')
        # BUG FIX: the original opened filename with open(..., 'w') and
        # called urlretrieve inside the with-block, so the same path was
        # held open while urlretrieve tried to write it; urlretrieve
        # alone creates and fills the file.
        urllib.request.urlretrieve(link, filename)
# Cast page for "The Fate of the Furious" (速度与激情8) on douban.
url = 'https://movie.douban.com/subject/26260853/celebrities'
if __name__ == '__main__':
    douban(url)
Python爬虫:斗鱼弹幕相关信息保存到mongodb
# 这个抓取弹幕,然后把用户的uid,昵称,等级,弹幕内容都保存到mongodb中
__author__ = '布咯咯_rieuse'
__time__ = '2017.6.2'
__github__ = 'https://github.com/rieuse'
import multiprocessing
import re
import socket
import time
import pymongo
import requests
from bs4 import BeautifulSoup
# MongoDB handles: database DouyuTV_danmu, collection info, local server.
clients = pymongo.MongoClient('localhost')
db = clients["DouyuTV_danmu"]
col = db["info"]
# Raw TCP connection to Douyu's open barrage (danmu) gateway.
# NOTE: the connection is opened at import time.
client = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
host = socket.gethostbyname("openbarrage.douyutv.com")
port = 8601
client.connect((host, port))
# Byte-level patterns over the STT-encoded messages: danmu text,
# sender uid, nickname, and level (level is absent on some messages).
danmu_path = re.compile(b'txt@=(.+?)/cid@')
uid_path = re.compile(b'uid@=(.+?)/nn@')
nickname_path = re.compile(b'nn@=(.+?)/txt@')
level_path = re.compile(b'level@=([1-9][0-9]?)/sahf')
def sendmsg(msgstr):
    """Frame *msgstr* per Douyu's protocol and send it on the global socket.

    Frame layout: total length twice (4-byte little-endian each), then
    the message-type code 689, then the UTF-8 payload.
    """
    msg = msgstr.encode('utf-8')
    data_length = len(msg) + 8
    code = 689  # client-to-server message type
    msg_head = (int.to_bytes(data_length, 4, 'little') * 2
                + int.to_bytes(code, 4, 'little'))
    client.send(msg_head)
    # sendall retries until the whole payload is written, replacing the
    # original hand-rolled partial-send loop.
    client.sendall(msg)
def start(roomid):
    """Log into a Douyu room, join its barrage group, and stream every
    danmu (uid, nickname, level, text) into MongoDB."""
    sendmsg('type@=loginreq/username@=rieuse/password@=douyu/roomid@={}/\0'.format(roomid))
    sendmsg('type@=joingroup/rid@={}/gid@=-9999/\0'.format(roomid))
    print('---------------欢迎连接到{}的直播间---------------'.format(get_name(roomid)))
    while True:
        data = client.recv(1024)
        if not data:  # peer closed the connection — check BEFORE parsing
            break
        uid_more = uid_path.findall(data)
        nickname_more = nickname_path.findall(data)
        level_more = level_path.findall(data)
        danmu_more = danmu_path.findall(data)
        for i in range(len(danmu_more)):
            try:
                product = {
                    # BUG FIX: index with i, not 0 — the original stored
                    # only the first danmu of every packet and paired it
                    # with the wrong uid/nickname for later messages.
                    'uid': uid_more[i].decode(encoding='utf-8'),
                    'nickname': nickname_more[i].decode(encoding='utf-8'),
                    # Level may be missing for some messages; the
                    # original fallback (level_more = b'0') made
                    # level_more[0] an int, so .decode() always raised.
                    'level': (level_more[i].decode(encoding='utf-8')
                              if i < len(level_more) else '0'),
                    'danmu': danmu_more[i].decode(encoding='utf-8')
                }
                print(product)
                col.insert_one(product)  # insert() is deprecated in pymongo 3+
                print('成功导入mongodb')
            except Exception as e:
                print(e)
def keeplive():
    """Ping the barrage server every 15 seconds so it keeps the
    connection alive."""
    while True:
        tick = str(int(time.time()))
        sendmsg('type@=keeplive/tick@=' + tick + '/\0')
        time.sleep(15)
def get_name(roomid):
    """Return the streamer's display name for the given room id.

    Fetches the room page and reads the anchor element that carries the
    streamer name.
    """
    resp = requests.get("http://www.douyu.com/" + roomid)
    soup = BeautifulSoup(resp.text, 'lxml')
    # BUG FIX: the attrs argument must be a dict — the original passed
    # the set {'class', 'zb-name'}, which does not filter on
    # class="zb-name" at all.
    return soup.find('a', {'class': 'zb-name'}).string
if __name__ == '__main__':
    # Prompt for the room id, then run the danmu reader and the
    # keep-alive pinger as two separate processes.
    # BUG FIX: prompt said "请出入" (typo) instead of "请输入".
    room_id = input('请输入房间ID: ')
    p1 = multiprocessing.Process(target=start, args=(room_id,))
    p2 = multiprocessing.Process(target=keeplive)
    p1.start()
    p2.start()
Python爬虫:抓取喜马拉雅电台音频
_author__ = '布咯咯_rieuse'
import json
import random
import time
import pymongo
import requests
from bs4 import BeautifulSoup
from lxml import etree
# MongoDB handles: database XiMaLaYa with one collection for album
# pages ("album2") and one for per-track detail records ("detaile2").
clients = pymongo.MongoClient('localhost')
db = clients["XiMaLaYa"]
col1 = db["album2"]
col2 = db["detaile2"]
UA_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, lik

本文涵盖了多个Python爬虫项目,包括豆瓣电影演员图片抓取、斗鱼弹幕信息存储、喜马拉雅音频下载、实习僧招聘信息抓取、花瓣网图片批量保存等。通过这些实例,深入探讨了XPath、Selenium和BeautifulSoup等工具的使用,以及数据存储至CSV和MongoDB的方法。
最低0.47元/天 解锁文章

2万+

被折叠的评论
为什么被折叠?



