Task
Crawl the username, post content, and post time of every floor (reply) in a Tieba thread, then arrange the entries in chronological order.
Key steps
- Parsing the page
Parsing is done with BeautifulSoup, which is quick and convenient! If you want to use my code directly, check whether the class names are still the same; if they differ, just change them accordingly!
response = requests.get(spider_url, headers=headers).content
soup = BeautifulSoup(response, "html.parser")
# soup = BeautifulSoup(open("page{}.html".format(page)), "html.parser")  # parse a locally saved page instead
for j in soup.find_all("div", class_="l_post l_post_bright j_l_post clearfix"):
    floor = [j.find_all("li", class_="d_name")[0].text.strip(),                            # username
             j.find_all("div", class_="d_post_content j_d_post_content")[0].text.strip(),  # post content
             j.find_all("div", class_="core_reply_tail clearfix")[0].text[-16:].strip()]   # timestamp (last 16 chars)
    print(floor)
    self.data_queue.put(floor)
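The commented-out line above parses a locally saved copy of the page instead of hitting the network, which ties in with the testing advice under "Attention to detail" below. Here is a minimal sketch of how a few pages could be saved to disk first; the URL and User-Agent are the ones from the full code, the page{n}.html names match the commented-out line, and the 3-page range and 2-second pause are my own choices, not from the original post:

import time
import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                         'AppleWebKit/537.36 (KHTML, like Gecko) '
                         'Chrome/87.0.4280.88 Safari/537.36'}
for page in range(1, 4):  # save the first 3 pages for offline testing
    url = "https://tieba.baidu.com/p/6132068127?pn={}".format(page)
    with open("page{}.html".format(page), "wb") as f:
        f.write(requests.get(url, headers=headers).content)
    time.sleep(2)  # pause between requests to go easy on the anti-crawl checks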
- Multithreading
I used 4 threads, mainly for demonstration! Threads and a queue are used together; you can picture it as several workers operating a single assembly line. Define a page-number queue; each thread takes page numbers from that queue, which guarantees that no thread crawls the same page twice, and finally puts its results into a result queue. For the details, see my full code below and the minimal sketch right after this list.
- Attention to detail
Baidu does have anti-crawling measures! So while you are still getting the parsing right, I suggest not sending requests over and over; instead save several pages to local files (see the sketch after the parsing snippet above), write your tests against those, and only send real requests once the tests pass, so that your IP does not get banned. If it is banned, waiting about half an hour is enough. Many people who get no results at all when crawling should check this first!
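To make the threads-plus-queue idea from the multithreading step concrete, here is a minimal, self-contained sketch of the same worker-pool pattern, stripped of the Tieba specifics; worker() and the squaring are stand-ins for the real crawl-and-parse, and unlike the full code below the workers simply exit when the queue drains instead of watching a CRAWL_EXIT flag:

import threading
from queue import Queue, Empty

task_queue = Queue()
result_queue = Queue()

for n in range(1, 11):           # the "assembly line": 10 page numbers
    task_queue.put(n)

def worker():
    while True:
        try:
            page = task_queue.get(block=False)  # each worker grabs a unique page
        except Empty:
            return                              # nothing left: this worker is done
        result_queue.put(page * page)           # stand-in for the real crawl/parse

threads = [threading.Thread(target=worker) for _ in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()                                    # wait for all workers to finish
print(sorted(result_queue.queue))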
Results
(screenshot of the sorted output omitted)
Full code (note that spider_url is the Tieba thread address)
import csv
import threading
from queue import Queue, Empty

import requests
import pandas as pd
from bs4 import BeautifulSoup

CRAWL_EXIT = False

class ThreadCrawl(threading.Thread):
    def __init__(self, thread_name, page_queue, data_queue):
        # Call the parent class initializer
        super(ThreadCrawl, self).__init__()
        self.threadName = thread_name
        self.page_queue = page_queue
        self.data_queue = data_queue

    def run(self):
        print(self.threadName + ' started ************')
        while not CRAWL_EXIT:
            try:
                page = self.page_queue.get(block=False)  # take a page number off the shared queue
            except Empty:
                continue  # no page available right now; re-check the exit flag
            headers = {
                'Accept': 'application/json, text/javascript, */*; q=0.01',
                'Accept-Encoding': 'gzip, deflate, br',
                'Accept-Language': 'zh-CN,zh;q=0.9',
                'Connection': 'Keep-Alive',
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.88 Safari/537.36',
                'sec-ch-ua': '"Google Chrome";v="87", " Not;A Brand";v="99", "Chromium";v="87"'
            }
            spider_url = "https://tieba.baidu.com/p/6132068127?pn={}".format(page)
            print(spider_url)
            response = requests.get(spider_url, headers=headers).content
            soup = BeautifulSoup(response, "html.parser")
            # soup = BeautifulSoup(open("page{}.html".format(page)), "html.parser")  # offline test with a saved page
            for j in soup.find_all("div", class_="l_post l_post_bright j_l_post clearfix"):
                floor = [j.find_all("li", class_="d_name")[0].text.strip(),                            # username
                         j.find_all("div", class_="d_post_content j_d_post_content")[0].text.strip(),  # post content
                         j.find_all("div", class_="core_reply_tail clearfix")[0].text[-16:].strip()]   # timestamp
                print(floor)
                self.data_queue.put(floor)

# Append one row to the CSV file
def toCsv(s):
    with open('data.csv', 'a', encoding="utf-8-sig", newline='') as f:
        csv.writer(f, dialect='excel').writerow(s)
    print('Row written')

# Hand-rolled chronological sort (bubble sort over plain "YYYY-MM-DD" date strings).
# Kept for reference; main() below uses the pandas-based sort_time() instead.
def date_sort(x):
    ls = list(x)
    # Bubble sort
    for j in range(len(ls) - 1):
        for i in range(len(ls) - j - 1):
            lower = ls[i].split('-')
            upper = ls[i + 1].split('-')
            for s in range(3):  # compare year, then month, then day
                if int(lower[s]) > int(upper[s]):
                    ls[i], ls[i + 1] = ls[i + 1], ls[i]
                    break
                elif int(lower[s]) < int(upper[s]):
                    break
    return ls

def main():
    # Declare a queue and load the 10 page numbers into it
    page_queue = Queue(10)
    for i in range(1, 11):
        page_queue.put(i)
    print(page_queue.queue)
    data_queue = Queue(maxsize=0)
    craw_list = ['crawler thread 1', 'crawler thread 2', 'crawler thread 3', 'crawler thread 4']
    thread_crawl = []
    for thread_name in craw_list:
        c_thread = ThreadCrawl(thread_name, page_queue, data_queue)
        c_thread.start()
        thread_crawl.append(c_thread)
    # Wait until page_queue is empty, i.e. every page number has been handed out
    while not page_queue.empty():
        pass
    # page_queue is empty, so signal the crawler threads to leave their loops
    global CRAWL_EXIT
    CRAWL_EXIT = True
    # Let each thread finish its last page before the results are read
    for t in thread_crawl:
        t.join()
    print(data_queue.empty())
    result_before = []
    while not data_queue.empty():
        result_before.append(data_queue.get())
    result = sort_time(result_before)
    for i in result:
        toCsv(i)

# Sort the collected rows chronologically with pandas
def sort_time(rows):
    df = pd.DataFrame(columns=["username", "content", "time"], data=rows)
    df['time'] = pd.to_datetime(df['time'])
    df.sort_values('time', inplace=True)
    return df.values.tolist()

if __name__ == '__main__':
    main()
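As a quick sanity check of the pandas-based sorting (a made-up example, not from the original post), sort_time can be exercised on hand-written rows whose timestamps have the "YYYY-MM-DD HH:MM" shape produced by the [-16:] slice:

rows = [["userB", "second post", "2019-05-30 12:34"],   # hypothetical sample rows
        ["userA", "first post", "2019-05-29 08:00"]]
print(sort_time(rows))
# -> [['userA', 'first post', Timestamp('2019-05-29 08:00:00')],
#     ['userB', 'second post', Timestamp('2019-05-30 12:34:00')]]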
If you have any questions or want to exchange ideas, contact me at my email: yongboyneverdie@163.com