1.目标网站:https://www.1point3acres.com/bbs/forum-28-1.html此处开始的若干页
2.首先创建两个队列,一个页面队列和一个用于I/O的队列。顺便创建个锁,防止写的时候出问题
# Queue of forum list-page URLs for the spider threads to consume
page_queue = Queue()
# Queue of (title, link) results for the writer threads to persist
joke_queue = Queue()
# Lock serialising writes to the shared csv.writer across writer threads
gLock = threading.Lock()
3.用CSV存储数据
# Open the CSV in append mode; newline='' is required by the csv module
fp = open('asd.csv','a+',newline='',encoding='utf-8')
url = 'https://www.1point3acres.com/bbs/forum-28-1.html'
writer = csv.writer(fp)
# NOTE(review): in append mode this header row is re-written on every run
writer.writerow(('标题','链接'))
4.寻找最大页码
max_page = find_max_page(url)
def find_max_page(url):
    """Return the highest page number shown in the forum pager, or None.

    NOTE(review): comp() is commented out elsewhere in this file and must be
    restored before this can run.
    """
    selector = comp(url)
    span_texts = selector.xpath('//div[@class="pg"]//span/text()')
    if span_texts:
        # Raw string for the regex; the original '\d+' only worked because
        # '\d' happens not to be a recognised string escape.
        digits = re.findall(r'\d+', span_texts[0])
        # Original indexed [0] unconditionally and raised IndexError when the
        # span held no digits; return None instead, matching the no-pager path.
        if digits:
            return int(digits[0])
    return None
5.循环入栈,把页面URL压入队列内
# range() excludes its stop value, so the original range(1, max_page) skipped
# the last page; use max_page + 1 to cover every page.
for x in range(1, max_page + 1):
    url = 'https://www.1point3acres.com/bbs/forum-28-{}.html'.format(x)
    page_queue.put(url)
# Start 4 parser (spider) threads
for x in range(4):
    t = BSSpider(page_queue, joke_queue)
    t.start()
# Start 4 CSV writer threads
for x in range(4):
    t = BSWriter(joke_queue, writer, gLock)
    t.start()
6.解析线程代码如下:
class BSSpider(threading.Thread):
    """Worker thread: fetches forum list pages and pushes (title, link) pairs."""
    # NOTE(review): the original called get_ua(), which is commented out in
    # this file and would raise NameError at class-creation time; use a fixed
    # UA string instead.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # The original spelled keys like 'accept - encoding', which are not
        # valid HTTP header names.  'Accept-Encoding' is deliberately omitted
        # so requests negotiates only encodings it can decode itself.
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.1point3acres.com/bbs/',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
    }
    # Proxy pool.  The original was a dict with 15 identical 'http' keys, so
    # only the last entry ever took effect; keep the candidates in a list and
    # pick one per request instead.
    proxy_pool = [
        'http://123.54.44.60:9999',
        'http://182.101.207.11:8080',
        'http://121.232.148.231:9000',
        'http://183.166.163.61:9999',
        'http://175.44.108.179:9999',
        'http://175.43.155.36:9999',
        'http://39.108.59.34:8118',
        'http://219.159.38.207:56210',
        'http://113.194.48.14:9999',
        'http://163.125.220.175:8118',
        'http://123.149.136.180:9999',
        'http://121.232.194.37:9000',
        'http://1.85.5.66:8060',
        'http://125.108.100.20:9000',
        'http://114.101.252.37:3000',
    ]

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """page_queue: URLs to crawl; joke_queue: receives (title, link)."""
        super(BSSpider, self).__init__(*args, **kwargs)
        # Thread links on the page are relative; prepend this base domain.
        self.base_domain = 'https://www.1point3acres.com/bbs/'
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    def run(self):
        while True:
            # Exit once every page URL has been taken.
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            print(url)
            # Rotate through the proxy pool, one proxy per request.
            proxies = {'http': random.choice(self.proxy_pool)}
            # (Removed `requests.adapters.DEFAULT_RETRIES = 30`: that module
            # global is bound as a default parameter when HTTPAdapter is
            # defined, so setting it here never enabled retries.)
            try:
                response = requests.get(url, headers=self.headers,
                                        proxies=proxies, timeout=100).text
            except requests.RequestException as e:
                # A dead proxy or timeout must not kill the whole thread.
                print('请求失败:', url, e)
                continue
            html = etree.HTML(response)
            titles = html.xpath('//a[contains(@class,"xst")]/text()')
            urls = html.xpath('//a[contains(@class,"xst")]/@href')
            for title, link in zip(titles, urls):
                # Hand the result to the I/O queue for BSWriter to persist.
                self.joke_queue.put((title, self.base_domain + link))
            print('完成一页')
7.I/O线程代码如下:
class BSWriter(threading.Thread):
    """Worker thread: drains (title, link) pairs from the queue into the CSV."""

    def __init__(self, joke_queue, writer, gLock, *args, timeout=40, **kwargs):
        """joke_queue: result queue; writer: shared csv.writer; gLock: write lock.

        timeout: seconds to wait for a new item before the thread exits
        (keyword-only; the default preserves the original 40 s behaviour).
        """
        super(BSWriter, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        # Guards the shared csv.writer across all writer threads.
        self.lock = gLock
        self.timeout = timeout

    def run(self):
        while True:
            try:
                title, link = self.joke_queue.get(timeout=self.timeout)
            except Empty:
                # The original bare `except:` also swallowed KeyboardInterrupt
                # and real bugs; only a queue timeout should end the thread.
                break
            # `with` releases the lock even if writerow raises.
            with self.lock:
                self.writer.writerow((title, link))
8.完整代码如下:
# -*- encoding: utf-8 -*-
#@Time: 15:40
#@Software:PyCharm
import csv
import random
import re
import ssl
import threading
import time
from queue import Empty, Queue

import requests
from lxml import etree
ssl._create_default_https_context = ssl._create_unverified_context
# def get_ua():
# user_agents = [
# "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.1 Safari/537.36",
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2227.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2226.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.4; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2225.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2224.3 Safari/537.36",
# "Mozilla/5.0 (Windows NT 10.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/40.0.2214.93 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.124 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 4.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2049.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.67 Safari/537.36",
# "Mozilla/5.0 (X11; OpenBSD i386) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1944.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.3319.102 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2309.372 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.2117.157 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1866.237 Safari/537.36",
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.137 Safari/4E423F",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/34.0.1847.116 Safari/537.36 Mozilla/5.0 (iPad; U; CPU OS 3_2 like Mac OS X; en-us) AppleWebKit/531.21.10 (KHTML, like Gecko) Version/4.0.4 Mobile/7B334b Safari/531.21.10",
# "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/33.0.1750.517 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.2; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1667.0 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/32.0.1664.3 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.16 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1623.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.17 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.62 Safari/537.36",
# "Mozilla/5.0 (X11; CrOS i686 4319.74.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.57 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.2 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1468.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1467.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/28.0.1464.0 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1500.55 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
# "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.90 Safari/537.36",
# "Mozilla/5.0 (X11; NetBSD) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
# "Mozilla/5.0 (X11; CrOS i686 3912.101.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.116 Safari/537.36",
# "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.60 Safari/537.17",
# "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_2) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1309.0 Safari/537.17",
# "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.15 (KHTML, like Gecko) Chrome/24.0.1295.0 Safari/537.15",
# "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.14 (KHTML, like Gecko) Chrome/24.0.1292.0 Safari/537.14",
# 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36',
# ]
# user_agent = random.choice(user_agents) # random.choice(),从列表中随机抽取一个对象
# return user_agent
class BSSpider(threading.Thread):
    """Worker thread: fetches forum list pages and pushes (title, link) pairs."""
    # NOTE(review): the original called get_ua(), which is commented out in
    # this file and would raise NameError at class-creation time; use a fixed
    # UA string instead.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        # The original spelled keys like 'accept - encoding', which are not
        # valid HTTP header names.  'Accept-Encoding' is deliberately omitted
        # so requests negotiates only encodings it can decode itself.
        'Accept-Language': 'zh-CN,zh;q=0.9',
        'Referer': 'https://www.1point3acres.com/bbs/',
        'Upgrade-Insecure-Requests': '1',
        'Connection': 'keep-alive',
    }
    # Proxy pool.  The original was a dict with 15 identical 'http' keys, so
    # only the last entry ever took effect; keep the candidates in a list and
    # pick one per request instead.
    proxy_pool = [
        'http://123.54.44.60:9999',
        'http://182.101.207.11:8080',
        'http://121.232.148.231:9000',
        'http://183.166.163.61:9999',
        'http://175.44.108.179:9999',
        'http://175.43.155.36:9999',
        'http://39.108.59.34:8118',
        'http://219.159.38.207:56210',
        'http://113.194.48.14:9999',
        'http://163.125.220.175:8118',
        'http://123.149.136.180:9999',
        'http://121.232.194.37:9000',
        'http://1.85.5.66:8060',
        'http://125.108.100.20:9000',
        'http://114.101.252.37:3000',
    ]

    def __init__(self, page_queue, joke_queue, *args, **kwargs):
        """page_queue: URLs to crawl; joke_queue: receives (title, link)."""
        super(BSSpider, self).__init__(*args, **kwargs)
        # Thread links on the page are relative; prepend this base domain.
        self.base_domain = 'https://www.1point3acres.com/bbs/'
        self.page_queue = page_queue
        self.joke_queue = joke_queue

    def run(self):
        while True:
            # Exit once every page URL has been taken.
            if self.page_queue.empty():
                break
            url = self.page_queue.get()
            print(url)
            # Rotate through the proxy pool, one proxy per request.
            proxies = {'http': random.choice(self.proxy_pool)}
            # (Removed `requests.adapters.DEFAULT_RETRIES = 30`: that module
            # global is bound as a default parameter when HTTPAdapter is
            # defined, so setting it here never enabled retries.)
            try:
                response = requests.get(url, headers=self.headers,
                                        proxies=proxies, timeout=100).text
            except requests.RequestException as e:
                # A dead proxy or timeout must not kill the whole thread.
                print('请求失败:', url, e)
                continue
            html = etree.HTML(response)
            titles = html.xpath('//a[contains(@class,"xst")]/text()')
            urls = html.xpath('//a[contains(@class,"xst")]/@href')
            for title, link in zip(titles, urls):
                # Hand the result to the I/O queue for BSWriter to persist.
                self.joke_queue.put((title, self.base_domain + link))
            print('完成一页')
class BSWriter(threading.Thread):
    """Worker thread: drains (title, link) pairs from the queue into the CSV."""

    def __init__(self, joke_queue, writer, gLock, *args, timeout=40, **kwargs):
        """joke_queue: result queue; writer: shared csv.writer; gLock: write lock.

        timeout: seconds to wait for a new item before the thread exits
        (keyword-only; the default preserves the original 40 s behaviour).
        """
        super(BSWriter, self).__init__(*args, **kwargs)
        self.joke_queue = joke_queue
        self.writer = writer
        # Guards the shared csv.writer across all writer threads.
        self.lock = gLock
        self.timeout = timeout

    def run(self):
        while True:
            try:
                title, link = self.joke_queue.get(timeout=self.timeout)
            except Empty:
                # The original bare `except:` also swallowed KeyboardInterrupt
                # and real bugs; only a queue timeout should end the thread.
                break
            # `with` releases the lock even if writerow raises.
            with self.lock:
                self.writer.writerow((title, link))
def main():
    """Crawl the forum list pages with 4 spider + 4 writer threads into a CSV."""
    page_queue = Queue()
    joke_queue = Queue()
    gLock = threading.Lock()
    # newline='' is required by the csv module.
    # NOTE(review): in append mode the header row is re-written on every run.
    fp = open('asd.csv', 'a+', newline='', encoding='utf-8')
    writer = csv.writer(fp)
    writer.writerow(('标题', '链接'))
    url = 'https://www.1point3acres.com/bbs/forum-28-1.html'
    max_page = find_max_page(url)
    if not max_page:
        # find_max_page returns None when the pager is missing; the original
        # then crashed with `range(1, None)` — bail out cleanly instead.
        fp.close()
        return
    # range() excludes its stop value, so the original range(1, max_page)
    # skipped the last page; use max_page + 1 to cover every page.
    for x in range(1, max_page + 1):
        page_queue.put('https://www.1point3acres.com/bbs/forum-28-{}.html'.format(x))
    spiders = [BSSpider(page_queue, joke_queue) for _ in range(4)]
    writers = [BSWriter(joke_queue, writer, gLock) for _ in range(4)]
    for t in spiders + writers:
        t.start()
    # Join all workers so the file is flushed and closed deterministically
    # (the original never joined the threads nor closed fp).
    for t in spiders:
        t.join()
    for t in writers:
        t.join()
    fp.close()
def find_max_page(url):
    """Return the highest page number shown in the forum pager, or None.

    NOTE(review): comp() is commented out at the bottom of this file and must
    be restored before this can run.
    """
    selector = comp(url)
    # e.g. the pager span reads '... / 123 页'; grab the first run of digits.
    pages = selector.xpath('//div[@class="pg"]//span/text()')
    if not pages:
        return None
    # Raw string for the regex; the original '\d+' only worked because '\d'
    # happens not to be a recognised string escape.
    digits = re.findall(r'\d+', pages[0])
    # Original indexed [0] unconditionally and raised IndexError when the span
    # held no digits; return None instead, matching the no-pager path.
    return int(digits[0]) if digits else None
# def comp(url):
# ua = get_ua()
# headers = {
# 'User-Agent': ua,
# 'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# 'accept - encoding': 'gzip, deflate, br',
# 'accept - language': 'zh - CN, zh;q = 0.9',
# 'referer': 'https://www.1point3acres.com/bbs/',
# 'upgrade - insecure - requests': '1',
# 'Connection': 'keep-alive',
# }
# proxies = {
# 'http': '123.54.44.60:9999',
# 'http': '182.101.207.11:8080',
# 'http': '121.232.148.231:9000',
# 'http': '183.166.163.61:9999',
# 'http': '175.44.108.179:9999',
# 'http': '175.43.155.36:9999',
# 'http': '39.108.59.34:8118',
# 'http': '219.159.38.207:56210',
# 'http': '113.194.48.14:9999',
# 'http': '163.125.220.175:8118',
# 'http': '123.149.136.180:9999',
# 'http': '121.232.194.37:9000',
# 'http': '1.85.5.66:8060',
# 'http': '125.108.100.20:9000',
# 'http': '114.101.252.37:3000',
# }
# requests.adapters.DEFAULT_RETRIES = 30
# html_data = requests.get(url=url, headers=headers, proxies=proxies, timeout=10)
# # html_data.encoding = html_data.apparent_encoding
# html = html_data.text
# selector = etree.HTML(html)
# return selector
# Run only when executed as a script, not when imported.
if __name__ == '__main__':
    main()