贴吧邮箱采集小工具
小工具写着玩
之前有一个小业务,需要进行批量邮箱群发,因此需要采集一些真实的邮箱数据。想了一下,用户自己留下的邮箱基本上都是真实的,而贴吧又属于流量比较大的地方,后来就写了一个小脚本进行测试;
工具还可以继续延伸,比如批量采集帖子,或者直接输入某个贴吧名后,把这个吧里面的所有帖子都采集一遍;目前贴吧
其实也有比较严格的风控,所以 cookie 需要自行解决
# Standard library and third-party imports.
import os
import re
import requests
import time
import urllib3
from bs4 import BeautifulSoup
# Requests below are made with verify=False (TLS verification disabled),
# which would otherwise spam an InsecureRequestWarning on every call.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# Fetch one Tieba page.
def get_tieba_url_request(url):
    """Fetch a Tieba page and return its HTML body.

    Parameters:
        url: full thread/forum URL to request.

    Returns:
        The response body as text on HTTP 200, otherwise ``None``.

    NOTE(review): the Cookie below is account-specific and will expire;
    it must be refreshed manually (Tieba's anti-crawler checks are tied
    to it).
    """
    headers = {
        'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'Accept-Language':'zh-CN,zh;q=0.9',
        'Connection':'keep-alive',
        'Cookie':'XFI=a7d17700-8440-11ee-83ac-c1665e3fb88c; XFCS=5FE2CCE3B26175874395D9B57093E47CB9FCA2C663EE7C7C349D9D838F2993BF; XFT=NJTyvqxEhVRCgUAJ2+P0fuIaHOrdJfqa3sC+jNOG9Do=; BIDUPSID=EDDAF011D517DD5F8526B74215AB4EC9; PSTM=1688621075; BAIDUID=38E64620B89BF9ED7F4418C078F3FA7A:FG=1; BDUSS=VczYmZrT0w5aTRvb09WdDBWWHdXRlo2aFdSUzYwV3NzWFlvQURYT05EZGREUjVsSVFBQUFBJCQAAAAAAQAAAAEAAABuKrw~xrbH7su5tdm30gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAF2A9mRdgPZkQ; BDUSS_BFESS=VczYmZrT0w5aTRvb09WdDBWWHdXRlo2aFdSUzYwV3NzWFlvQURYT05EZGREUjVsSVFBQUFBJCQAAAAAAQAAAAEAAABuKrw~xrbH7su5tdm30gAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAF2A9mRdgPZkQ; newlogin=1; STOKEN=ddb417856b1e3da56ca796f300506e915f4ddd993c855891aab28ba8cda5b3e9; H_WISE_SIDS_BFESS=110085_264354_268593_271170_270102_274778_276533_271563_277031_277161_277355_275732_278055_272563_278575_278388_279021_279610_277757_279012_279749_279999_278415_259642_276438_280809_280557_278791_280635_281038_281235_277970_281232_281367_279203_280437_281094_281520; Hm_lvt_98b9d8c2fd6608d564bf2ac2ae642948=1698126102,1698902772,1700098710; USER_JUMP=-1; st_key_id=17; 5364263534_FRSVideoUploadTip=1; video_bubble5364263534=1; wise_device=0; H_PS_PSSID=39635_39669_39673_39664_39684_39694_39676_39678_39713_39740; BDORZ=FFFB88E999055A3F8A630C64834BD6D0; BAIDUID_BFESS=38E64620B89BF9ED7F4418C078F3FA7A:FG=1; ZFY=Wb1tH:AZ4:BZj9y2:BJUusTBjVYzyZics9VP4QaL22tPN0:C; delPer=0; PSINO=3; BAIDU_WISE_UID=wapp_1700102029361_689; arialoadData=false; H_WISE_SIDS=39635_39669_39673_39664_39684_39694_39676_39678_39713_39740; XFI=956c21e0-843c-11ee-b883-45510d578902; XFCS=A1395407BFF53914CDBA12B8F843DDC3ED65E5DBEF1285BBA053FCAA4C9F50BD; XFT=7ZvJaZXFmql3O5yIAm+Ku0cTRLQZ/UdO/82Pe+Gj2BY=; RT="z=1&dm=baidu.com&si=77dad1ea-ef2d-417f-9960-7117c29c953a&ss=lp0pt9zo&sl=x&tt=r9d&bcn=https%3A%2F%2Ffclog.baidu.com%2Flog%2Fweirwood%3Ftype%3Dperf&ld=5xsr&nu=cebq8ru3&cl=5xe6&ul=5xh3&hd=5y6b"; tb_as_data=cf23fbb996369c3d712df0cb4cd906d2df97a42545d16b9077a3b0d9f3e79c303ad42750500d9e3e2d5395f7eefe9971cc996474adf7b3e70dcacfe2aefcece668659c3b960735aa8cb9a267be1b27ce5f92d983bd0c6e996e384378ce64b00b1d5d8f2b9aa676c5dbe966eed970df73; Hm_lpvt_98b9d8c2fd6608d564bf2ac2ae642948=1700112372; BA_HECTOR=2485812g80al85agak2l8h231ilb9vk1r; ab_sr=1.0.1_MWQ3ODdkYzdkMDhkODlkMGE1ZjhjMDFmZWE2YTI3Njc1ODQ3MGU3Y2MyNjkxMTllZDYxYThmMzMwMzViZmZhOGU3YTc1NzIyNWE2NDNiZTAzODQwMmUwYjhiMGU5OTg5MjNhMjU5OGQ2ZTY1Nzg2MjQ5ZTBkNWZkMzQ5MmQ2MWJiNzA5ZjljODdmNWU3YjU2M2FjZjUxMGQ4NDVmMWIyMjNhZTMxNDA3ZTY4ZTQ2MDdmYTlkMzljNmQ2ZjRmYzhm; st_data=2280640e7dd502f250f770692dbb46081a224de35ef3b6f396493a615f071feb338a07deb489b09ec58afd7e0024375cc52019d6f4fdb22dd5abdaa6cb1ed526127793dac697c20951d94eb99604ab75e8ec3baddb3041b57ee5809b35bf413327e095eda8ab467f0e5bdbff52f13ecaf10734b5a452918eab2a6a2fe8f88dc74fb65af3e23ccf090b2e4b9933939ff8; st_sign=b4d2cd91',
        'Host':'tieba.baidu.com',
        'Referer':'https://tieba.baidu.com/f?ie=utf-8&kw=%E5%A4%96%E8%B4%B8%E8%8B%B1%E8%AF%AD&fr=search',
        'Upgrade-Insecure-Requests':'1',
        'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36'
    }
    # verify=False: certificate checking disabled (warnings silenced at import).
    res = requests.get(url=url, headers=headers, verify=False, timeout=10)
    if res.status_code == 200:
        print(f'{url}访问成功>>>')
        return res.text
    # Fix: the original fell through silently on non-200 and returned None
    # implicitly; signal the failure explicitly so callers can see why.
    print(f'{url}访问失败, status={res.status_code}')
    return None
# Get the number of pages in a thread.
def get_page(html_data):
    """Extract the total page count from a thread's first-page HTML.

    Parameters:
        html_data: HTML text of the thread's first page.

    Returns:
        The page count as a string, exactly as rendered in the page header.

    Raises:
        ValueError: when the page-count element is missing — typically an
            expired cookie, an anti-crawler interstitial, or a layout change.
    """
    soup = BeautifulSoup(html_data, 'lxml')
    matches = soup.select('#thread_theme_5 > div.l_thread_info > ul > li:nth-child(2) > span:nth-child(2)')
    # Fix: the original indexed [0] unconditionally, so a missing element
    # crashed with an opaque IndexError; fail with a diagnosable message.
    if not matches:
        raise ValueError('page-count element not found; cookie may have expired or the page layout changed')
    page = matches[0].get_text()
    print(f'页数一共=>{page}')
    return page
# Visit every page of the thread and harvest e-mail addresses.
def get_tieba_page_request(url, page):
    """Iterate over all pages of a thread and collect e-mail addresses.

    Parameters:
        url: base thread URL (the ``?pn=`` page parameter is appended here).
        page: total number of pages (str or int; coerced with ``int()``).

    Returns:
        A list of every e-mail-looking string found across all pages
        (the original returned None; callers ignoring the result are
        unaffected).
    """
    # Fix: the original used range(1, int(page)), which skipped the last
    # page; pages are numbered 1..page inclusive.
    email_re = re.compile(r'(\w+@\w+\.[a-z]+)', re.S)  # compiled once, outside the loop
    all_emails = []
    for i in range(1, int(page) + 1):
        html_data = get_tieba_url_request(url + '?pn=' + str(i))
        if html_data is None:
            # Fix: a failed fetch returned None and crashed on .encode();
            # skip the page instead.
            continue
        # Undo unicode-escape mangling in the raw HTML before matching.
        html_data = html_data.encode('utf-8').decode('raw_unicode_escape')
        email_list = email_re.findall(html_data)
        print(email_list)
        all_emails.extend(email_list)
        time.sleep(5)  # throttle to stay under Tieba's rate limiting
    return all_emails
def main():
    """Entry point: crawl one hard-coded thread and print harvested e-mails."""
    thread_url = 'https://tieba.baidu.com/p/7371997'
    # Fetch the first page once to discover the total page count,
    # then walk every page of the thread.
    first_page_html = get_tieba_url_request(thread_url)
    total_pages = get_page(first_page_html)
    get_tieba_page_request(thread_url, total_pages)

if __name__ == '__main__':
    main()