Crawling triple data
import requests                                    # HTTP GET
from pyquery import PyQuery as pq                  # HTML parsing
import time
from requests.exceptions import RequestException
import os                                          # file/directory handling

# The Cookie below is a session cookie copied from a logged-in browser; replace it with your own before running.
header = {
    "Accept-Encoding": "utf-8",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Cookie': '53gid2=11032126359000; 53revisit=1635492779842; Hm_lvt_5eba60fe8b3ad191defab76fc103306b=1653542246; Hm_lpvt_5eba60fe8b3ad191defab76fc103306b=1653542246; Hm_lvt_01516e39a68d06a0ee1086c84061d6fd=1653542246; b9455d04ae144ea1947fd21d26372d4c=WyIzMDczNDcxNTI5Il0; suggest=1; pageSize=10; history=1; news.suggest=1; news.pageSize=10; news.history=1; visitor_type=old; 53gid0=11032126359000; 53gid1=11032126359000; 53kf_72167138_from_host=baike.yongzin.com; 53kf_72167138_keyword=https://wenku.yongzin.com/; 53kf_72167138_land_page=https%3A%2F%2Fbaike.yongzin.com%2F; kf_72167138_land_page_ok=1; Hm_lvt_d07a7658543359ec17ca5a94e20b7a14=1653542259; 53uvid=1; onliner_zdfq72167138=0; JSESSIONID=E9BFF79059D76C711EB517B52F8BBBFB; weblocal=zh_ZW; token=4187dbd8-910b-4864-a7de-30b387b985c5; Hm_lpvt_d07a7658543359ec17ca5a94e20b7a14=1653542351; Hm_lpvt_01516e39a68d06a0ee1086c84061d6fd=1653542351'
}
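The functions below call requests.get with this header but no timeout, so a stalled connection stalls the whole crawl. A minimal fetch helper with a timeout and simple backoff (my own sketch, not part of the original script) would make the requests more robust:

def fetch(url, retries=3, timeout=10):
    # GET with the shared header, a timeout, and exponential backoff between retries (sketch).
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=header, timeout=timeout)
            resp.raise_for_status()
            return resp
        except RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)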
def Nestype_url(url, name):
    # Collect the category links on the baike front page, then crawl each category.
    path = "云藏百科三元组数据\\"
    if not os.path.exists(path):  # create the output directory if it does not exist
        os.makedirs(path)
    page_title = []
    page_link = []
    response = requests.get(url, headers=header)
    content = response.content.decode("utf-8", "ignore")
    soup = pq(content)
    dire = soup('div.main-wrap div.mt20 div.p0 div.row div.f16 h4')
    links = dire('a')
    for link in links.items():
        li = link.attr('href')
        title = link.text()
        page_link.append(url + '/' + li)
        page_title.append(title)
    response.close()
    for i in range(len(page_title)):
        get_page_url(page_link[i], page_title[i], path)
def get_page_url(link, title, path):
    # Walk through pages 1-20 of one category and crawl every entry on each page.
    text_url = []
    text_name = []
    if not os.path.exists(path):  # create the output directory if it does not exist
        os.makedirs(path)
    linker = link
    for i in range(1, 21):
        url = linker + '&pageNo={}'.format(i)
        print("--------------- page {} ------------------".format(i))
        print(url)
        response = requests.get(url, headers=header)
        content = response.content.decode("utf-8", "ignore")
        html = pq(content)
        items = html("div.thr-wrap div.sortList_lf div.textbox")
        for item in items.items():
            link = item('h4.f16 a').attr('href')
            title = item("h4.f16 a").text()
            for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ ':
                title = title.replace(ch, "")  # strip punctuation so the title is filename-safe
            title = title.replace('.txt', '')
            text_name.append(title)
            text_url.append("https://baike.yongzin.com" + link)
        response.close()
        for j in range(len(text_name)):
            get_text(text_name[j], text_url[j], path)
            time.sleep(1)  # throttle requests
        text_name.clear()
        text_url.clear()
def get_text(name, url, path, no=[2386]):
    # Scrape one entry's attribute/value rows and append them as one record.
    page_triple1 = []
    page_triple2 = []
    if not os.path.exists("三元组网页记录test.txt"):  # history of visited pages
        open("三元组网页记录test.txt", 'a', encoding='utf-8').close()
    with open("三元组网页记录test.txt", 'r', encoding='utf-8') as f:
        base_url = f.read()
    if url not in base_url:  # skip pages that were already crawled
        with open(path + '\\{}.txt'.format('百科三元组数据test'), 'a', encoding='utf-8') as f:
            response = requests.get(url, headers=header)
            try:
                content = response.content.decode("utf-8", "ignore")
                html = pq(content)
                items = html("div.main-wrap div.three_com_l div.mt30 div.col-xs-6")
                for item in items.items():
                    page_triple1.append(item("div.w1").text())  # attribute name
                    page_triple2.append(item("div.w2").text())  # attribute value
                response.close()
                no[0] += 1
                print(no[0])
                f.write("{}#{}#".format(no[0], name))
                for i in range(len(page_triple1)):
                    print(page_triple1[i], '%', page_triple2[i])
                    f.write("{};{}.".format(page_triple1[i], page_triple2[i]))
                f.write("\n")
            except Exception as e:
                print("failed:", url, e)
        with open("三元组网页记录test.txt", 'a', encoding='utf-8') as f:
            f.write("{}\n".format(url))  # record the page as visited
def get_head_page(url):
    # Read the portal's header navigation and enter the baike (encyclopedia) channel.
    response = requests.get(url, headers=header)
    html = response.content.decode("utf-8")
    page_name = []
    page_url = []
    docment = pq(html)
    links = docment('header ul.f16 li a').items()
    for item in links:
        link = item.attr("onclick")[17:-3]  # slice the target URL out of the onclick JavaScript
        name = item.text()
        page_name.append(name)
        page_url.append(link)
    response.close()
    for i in range(len(page_url)):
        if i == 5:  # only follow the sixth header link, the baike channel crawled above
            print(page_name[i], page_url[i])
            Nestype_url(page_url[i], page_name[i])

if __name__ == '__main__':
    url = 'https://www.yongzin.com/'
    get_head_page(url)
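Each line of the output file has the shape no#entity#attr1;val1.attr2;val2. A reader for that format (my own sketch, assuming no field itself contains '#', ';', or '.'):

def parse_triples(path):
    # Turn 'no#entity#attr1;val1.attr2;val2.' lines back into (entity, attribute, value) triples.
    triples = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip('\n').split('#', 2)
            if len(parts) < 3:
                continue
            _, entity, pairs = parts
            for pair in pairs.split('.'):
                if ';' in pair:
                    attr, _, value = pair.partition(';')
                    triples.append((entity, attr, value))
    return triples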
Crawling Q&A data
import requests                                    # HTTP GET
from pyquery import PyQuery as pq                  # HTML parsing
from selenium import webdriver                     # drives a real browser for the JS-based pagination
import time
import math
import os                                          # file/directory handling

# The Cookie below is a session cookie copied from a logged-in browser; replace it with your own before running.
header = {
    "Accept-Encoding": "utf-8",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36',
    'Cookie': '53gid2=11032126359000; 53revisit=1635492779842; Hm_lvt_5eba60fe8b3ad191defab76fc103306b=1653542246; Hm_lpvt_5eba60fe8b3ad191defab76fc103306b=1653542246; Hm_lvt_01516e39a68d06a0ee1086c84061d6fd=1653542246; b9455d04ae144ea1947fd21d26372d4c=WyIzMDczNDcxNTI5Il0; suggest=1; pageSize=10; history=1; news.suggest=1; news.pageSize=10; news.history=1; visitor_type=old; 53gid0=11032126359000; 53gid1=11032126359000; 53kf_72167138_from_host=baike.yongzin.com; 53kf_72167138_keyword=https://wenku.yongzin.com/; 53kf_72167138_land_page=https%3A%2F%2Fbaike.yongzin.com%2F; kf_72167138_land_page_ok=1; Hm_lvt_d07a7658543359ec17ca5a94e20b7a14=1653542259; 53uvid=1; onliner_zdfq72167138=0; JSESSIONID=E9BFF79059D76C711EB517B52F8BBBFB; weblocal=zh_ZW; token=4187dbd8-910b-4864-a7de-30b387b985c5; Hm_lpvt_d07a7658543359ec17ca5a94e20b7a14=1653542351; Hm_lpvt_01516e39a68d06a0ee1086c84061d6fd=1653542351'
}
# Get the links of the 11 top-level categories
def Nestype_url(url, name):
    path = "云藏QA数据\\"
    if not os.path.exists(path):  # create the output directory if it does not exist
        os.makedirs(path)
    page_title = []
    page_link = []
    response = requests.get(url, headers=header)
    content = response.content.decode("utf-8", "ignore")
    soup = pq(content)
    dire = soup('div.thr-wrap2 div.mt10 div.ml20 div.title01')
    links = dire('a')
    for link in links.items():
        li = link.attr('href')
        title = link.text()
        page_link.append("https://zhidao.yongzin.com" + li)
        page_title.append(title)
    response.close()
    for i in range(len(page_title)):
        print("--------- category {} -------------".format(i + 1))
        print(page_link[i], page_title[i], path)
        # Mini_category(page_link[i], page_title[i], path)  # optional: descend into sub-categories first
        get_page_url(page_link[i], path)
# Get the links of the sub-categories (only used if Mini_category is re-enabled above)
def Mini_category(url, name, path):
    page_title = []
    page_link = []
    response = requests.get(url, headers=header)
    content = response.content.decode("utf-8", "ignore")
    soup = pq(content)
    dire = soup('div.thr-wrap2 div.mt10 div.ml20 div.title01')
    links = dire('a')
    for link in links.items():
        li = link.attr('href')
        title = link.text()
        page_link.append("https://zhidao.yongzin.com" + li)
        page_title.append(title)
    response.close()
    for i in range(len(page_title)):
        print(page_link[i], page_title[i])
        get_page_url(page_link[i], path)
# Page through one category; the next-page control is JavaScript-driven, so Selenium is used here
def get_page_url(url, path):
    # First read how many questions the category holds and derive the page count (10 per page)
    response = requests.get(url, headers=header)
    content = response.content.decode("utf-8", "ignore")
    html = pq(content)
    page_num = html('div.thr-wrap2 div.mt10 div.sortList_rf4 div.pb_box1 div.p15 div.red').text()
    num = int(math.ceil(int(page_num) / 10))
    print(num)
    response.close()
    text_url = []
    text_name = []
    driver = webdriver.Chrome()
    driver.get(url)
    driver.find_element_by_link_text('ཐག་བཅད་དྲི་བ།').click()  # switch to the resolved-questions tab
    button = driver.find_element_by_xpath('//*[@id="rpage"]/ul/li[2]/a[1]')
    driver.execute_script("arguments[0].click();", button)
    for i in range(num):
        # find_elements (plural) returns an empty list instead of raising
        # when the next-page link is missing on the last page
        if driver.find_elements_by_link_text('ཤོག་ངོས་ཞོལ་མ།'):
            try:
                time.sleep(5)  # wait for the page to render
                html = pq(driver.page_source)
                dire = html('div.thr-wrap2 div.mt10 div.ml20 div.mt40 div.lists div#rlist ul li h2')
                for link in dire('a').items():
                    li = link.attr('href')
                    title = link.text()
                    text_url.append("https://zhidao.yongzin.com" + li)
                    for ch in '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~ ':
                        title = title.replace(ch, "")  # strip punctuation
                    title = title.replace('.txt', '')
                    text_name.append(title)
                for j in range(len(text_url)):  # j, so the page counter i is not clobbered
                    time.sleep(0.8)
                    print(text_url[j], text_name[j])
                    get_text(text_url[j], text_name[j], path)
                text_name.clear()
                text_url.clear()
                driver.find_element_by_link_text('ཤོག་ངོས་ཞོལ་མ།').click()  # advance to the next page
            except Exception as e:
                print("page {} failed:".format(i + 1), e)
    driver.quit()
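The fixed time.sleep(5) per page dominates the crawl time. An explicit wait on the result list (my own sketch, assuming the list keeps the div#rlist id used in the selector above) is usually faster and more reliable:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_results(driver, timeout=10):
    # Block until div#rlist is present instead of sleeping a fixed 5 seconds.
    WebDriverWait(driver, timeout).until(
        EC.presence_of_element_located((By.ID, 'rlist'))
    )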
# Fetch one question page and append its question/answer pair to the output file
def get_text(url, name, path, no=[0]):
    # The mutable default `no` is created once, so the record counter persists across calls.
    if not os.path.exists("问答网页记录56333条.txt"):  # history of visited pages
        open("问答网页记录56333条.txt", 'a', encoding='utf-8').close()
    with open("问答网页记录56333条.txt", 'r', encoding='utf-8') as f:
        base_url = f.read()
    if url not in base_url:  # skip pages that were already crawled
        with open(path + '\\{}.txt'.format('QA数据56333条'), 'a', encoding='utf-8') as f:
            response = requests.get(url, headers=header)
            try:
                content = response.content.decode("utf-8", "ignore")
                html = pq(content)
                question = html('div.main-wrap div.three_com_l div.three-know-title01 span.brown').text()
                answer = html('div.main-wrap div.three_com_l div.three-know-l-box01 ul li p').text()
                response.close()
                no[0] += 1
                print(no[0])
                f.write("{}#{}@{}#{}\n".format(no[0], name, question, answer))
            except Exception as e:
                print("failed:", url, e)
        with open("问答网页记录56333条.txt", 'a', encoding='utf-8') as f:
            f.write("{}\n".format(url))  # record the page as visited
def get_index_page(url, name):
    # From the zhidao front page, follow the category index link into Nestype_url.
    response = requests.get(url, headers=header)
    content = response.content.decode("utf-8", "ignore")
    html = pq(content)
    index_url = html("div.main-wrap div.row div.p15 div.row").children().next().attr("href")
    index_name = html("div.main-wrap div.row div.p15 div.row").children().next().children().next().text()
    response.close()
    print(index_url, index_name)
    Nestype_url(index_url, index_name)
def get_head_page(url):
    # Read the portal's header navigation and enter the zhidao (Q&A) channel.
    response = requests.get(url, headers=header)
    html = response.content.decode("utf-8")
    page_name = []
    page_url = []
    docment = pq(html)
    links = docment('header ul.f16 li a').items()
    for item in links:
        link = item.attr("onclick")[17:-3]  # slice the target URL out of the onclick JavaScript
        name = item.text()
        page_name.append(name)
        page_url.append(link)
    response.close()
    for i in range(len(page_url)):
        if i == 7:  # only follow the eighth header link, the zhidao channel crawled above
            print(page_name[i], page_url[i])
            get_index_page(page_url[i], page_name[i])

if __name__ == '__main__':
    url = 'https://www.yongzin.com/'
    get_head_page(url)
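Each line of the Q&A output file has the shape no#name@question#answer. A matching reader (my own sketch, assuming the question and answer contain no '#' and the name contains no '@'):

def parse_qa(path):
    # Turn 'no#name@question#answer' lines back into (name, question, answer) records.
    records = []
    with open(path, encoding='utf-8') as f:
        for line in f:
            head, _, rest = line.rstrip('\n').partition('@')
            if not rest:
                continue
            name = head.split('#', 1)[-1]
            question, _, answer = rest.partition('#')
            records.append((name, question, answer))
    return records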