爬这个网站的时候学到了比较多的东西,也对python的基础知识点有了更好的了解。
首先是.join的用法,处理爬到的text还是比较好用的
然后是xpath的following::用法,取该节点后面的所有同级节点,很好用
然后是一个困惑,不知道什么原因,爬取导师的全部信息的时候用string只能爬取最后一行,需要用text(),希望是我的vscode出的问题
以后如果没有特殊情况,爬学校网站的代码就不贴上来了,并不是很有意思
接下来是代码
import requests
from lxml import etree
import random
from string import punctuation
import re
import time
import pymongo
from pymongo import MongoClient
def download(url):
    """Fetch *url* and return the parsed lxml HTML tree.

    Sends a desktop Chrome User-Agent so the site serves the normal page,
    and sleeps one second before each request to throttle the crawl.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.169 Safari/537.36',
    }
    time.sleep(1)  # be polite: rate-limit consecutive requests
    # A timeout keeps an unresponsive server from hanging the crawler forever;
    # the original call had none and could block indefinitely.
    r = requests.get(url, headers=headers, timeout=30)
    r.encoding = 'utf-8'
    return etree.HTML(r.text)
def w_d(parts):
    """Join a list of text fragments into one newline-separated string.

    Used to flatten the ``text()`` node lists returned by the xpath
    queries in ``deep_spider``. Returns '' for an empty list.
    """
    # The original named its parameter ``list`` and its local ``str``,
    # shadowing both builtins; renamed to avoid that.
    return '\n'.join(parts)
def write_down(teacher_imfo):
    """Insert one teacher record into MongoDB and report progress.

    Writes *teacher_imfo* (a dict of field-name -> text) into the
    ``teacher_imfo`` collection of the local ``jiliang_teacher_data``
    database, then prints the teacher's name.
    """
    client = MongoClient()  # default localhost:27017
    db = client.jiliang_teacher_data
    collection = db.teacher_imfo
    collection.insert_one(teacher_imfo)
    # NOTE(review): key '姓名:' (with trailing colon) is assumed to come from
    # the scraped header cells in deep_spider — confirm against the site.
    print('正在下载:' + teacher_imfo['姓名:'])
def deep_spider(url):
    """Scrape one teacher's detail page and store the record in MongoDB.

    Pairs the header cells (labels) with the content cells that follow
    them, then appends the six long free-text sections that sit at fixed
    rows of the detail table, and hands the finished dict to write_down.
    """
    selector = download(url)
    centered = selector.xpath('//*[@align="center"]')
    labels = centered[1].xpath('following::td/span/text()')
    values = centered[2].xpath('following::td/text()')
    # Pad with blanks so every label still gets a value when the page has
    # empty cells; zip() then pairs safely even if values stays shorter
    # (the original indexed cont[i] and could raise IndexError).
    values.extend(' ' for _ in range(9))
    # The original called this dict ``dict``, shadowing the builtin.
    info = {}
    for label, value in zip(labels, values):
        info[label] = value
    # The long free-text sections live at fixed table rows.
    sections = {
        '课题': 9,
        '获奖': 11,
        '近期发表的主要成果': 13,
        '主持完成的科研项目': 15,
        '其他': 19,
        '个人简历': 17,
    }
    row_xpath = '/html/body/div[3]/div/table/tr[{}]/td/p/text()'
    for key, row in sections.items():
        info[key] = w_d(selector.xpath(row_xpath.format(row)))
    write_down(info)
def spider_zong(url):
    """Crawl the teacher index page and scrape every teacher detail page.

    Relative hrefs on the index page point at teacher pages; absolute
    links (containing 'http') and in-page anchors (containing '#') are
    navigation noise and are dropped.
    """
    selector = download(url)
    hrefs = selector.xpath('//*[@target="_blank"]/@href')[8:]
    # Filter once with a comprehension. The original removed items from
    # the list while iterating it — which skips the element after each
    # removal — and wrapped the whole thing in an extra
    # ``for j in range(len(...))`` pass to paper over the skipped entries.
    detail_urls = [h for h in hrefs if 'http' not in h and '#' not in h]
    for relative in detail_urls:
        order_url = 'https://yjsb.cjlu.edu.cn/yjsy/daoshi/{}'.format(relative)
        deep_spider(order_url)
# Entry point: guard the crawl so importing this module does not fire it.
if __name__ == '__main__':
    spider_zong('https://yjsb.cjlu.edu.cn/yjsy/daoshi/index.jspx')
总的来说其实就这样,并没有什么新奇的内容