#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Aug 30 17:41:40 2021
@author: ledi
"""
import requests
from lxml import etree
import datetime
from bs4 import BeautifulSoup
# Browser-like User-Agent so Douban does not reject the request outright.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1"
}

# Output directory for anything this script saves.
# path = input("请输入保存路径:")  # original interactive prompt, kept for reference
path = './'

start_time = datetime.datetime.now()

# First page of the group's discussion list; later pages use ?start=N.
url = 'https://www.douban.com/group/707669/'
# url = 'https://www.douban.com/group/707669/discussion?start=50'

# Auth payload for the request.
# NOTE(review): this is sent as the *body* of a GET request, which Douban
# almost certainly ignores (its auth is cookie-based) -- confirm whether it
# is needed at all before relying on it.
data = {
    'channel': 'notification',
    'user': 171179645,
    'auth': '171179645_1630319952:4c75d6c238952c300674a7ac028cf7ad2ba527ce',
}

# BUG FIX: the original issued the same request twice back-to-back and threw
# the first response away; one request is enough.
html = requests.get(url, headers=headers, data=data).text
soup = BeautifulSoup(html, "lxml")

# Each discussion row lives in a <td class="title"> cell.
div_list = soup.find_all('td', class_='title')
import time

# Collected rows: [topic_url, title, create_time] for each discussion thread.
data = []
for cell in div_list:
    link = cell.a
    if link is None:
        # Defensive: a title cell without an <a> (e.g. header rows).
        continue

    # BUG FIX: the original did str(cell.a).split() and picked tokens by
    # position (c[2], c[-2]), which breaks whenever the attribute order
    # changes or the title contains whitespace.  Read the tag attributes
    # directly instead.
    topic_url = link.get('href')
    title = link.get('title') or link.get_text(strip=True)
    temp = [topic_url, title]

    # Fetch the topic page and pull its creation timestamp via XPath
    # (the <span class="create-time"> inside the topic header).
    topic_html = requests.get(topic_url, headers=headers).text
    tree = etree.HTML(topic_html)
    spans = tree.xpath('//*[@id="topic-content"]/div[2]/h3/span[2]')
    # Guard span.text against None before stripping.
    created = [span.text.strip() for span in spans if span.text]

    row = temp + created
    data.append(row)
    print(row)
    time.sleep(1)  # be polite: at most one topic request per second
# soup = BeautifulSoup(html, 'html.parser')
# map_node = soup.find_all("tbody")
# div_list = soup.find_all('tr', class_='title')
# for row in soup.select('tbody tr'):
# row_text = [x.text for x in row.find_all('td')]
# print(', '.join(row_text)) # You can save or print this string however you want.
# html = etree.HTML(rep)
# # 获取电影封面图 电影名称 xpath定位提取 得到的是列表
# src = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/img/@src')
# name = html.xpath('//*[@id="content"]/div/div[1]/ol/li/div/div[1]/a/img/@alt')
# # 保存到本地
# for src, name in zip(src, name):
# file_name = name + ".jpg"
# img = requests.get(src, headers=headers).content
# with open(path + "/" + file_name, "wb") as f:
# f.write(img)
# if __name__ == "__main__":
# # 列表推导式得到url列表 10页的电影信息 Top250
# url_list = ["https://movie.douban.com/top250?start={}&filter=".format(x * 25) for x in range(10)]
# for url in url_list:
# get_pic(url)
# delta = (datetime.datetime.now() - start_time).total_seconds()
# print("抓取250张电影封面图用时:{}s".format(delta))
# 爬豆瓣小组 (scraping a Douban group)
# 最新推荐文章于 2025-05-14 17:46:45 发布
# NOTE(review): the two lines above are residue pasted from the source blog
# page; left as comments so the file remains valid Python.