import requests
from lxml import html
import docx
from docx.shared import Pt
result = ""
resultList = []
# Scrape the answers from Qingshu Xuetang (青书学堂)
def Crawler_Answer():
    # Target exercise page
    url = "https://degree.qingshuxuetang.com/lnsh/Student/ViewExerciseAnswer?courseId=31&exerciseId=622edd58d6018000016ca3b7&teachPlanId=403&periodId=19"
    # Pretend to be a regular browser; the Cookie carries the login session
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36",
        "Cookie": "__environment=production; AccessToken=eyJ0eXAiOiJKV1QiLCJhbGciOiJIUzI1NiJ9.eyJ1SWQiOjMzNTIxNzMxLCJyb2xlIjoxMDAsImNsaWVudCI6InBjd2ViIiwib3JncyI6WyJ7XCJ0eXBlXCI6XCJkZ1wiLFwicm9sZXNcIjpbMV0sXCJpZFwiOjExNCxcIm91SWRcIjoxMzQ2MX0iXSwiZXhwIjoxNjYzMjM3NzU3LCJqdGkiOiJqd3Q1NTAwOWNiNDU5NDU0YTNlOThhN2NmMmVhZDY0Nzk3MyIsInBsYXRmb3JtIjoicXN4dCJ9.fUsLBHfDgj8IBAVv7w-dP8YyihSv3wImEQK9UMQB9fc"
    }
    page = requests.Session().get(url, headers=header)
    tree = html.fromstring(page.text)
    # result = tree.xpath('//h2/text()')
    # Loop over the questions; i is the question number.
    # xpath(.../text()) returns a list of text nodes, so join them into one
    # string before storing: add_paragraph() later expects a plain string.
    for i in range(1, 21):
        result = tree.xpath(
            '//*[@id="form1"]/div[' + str(i) + ']/div[3]/div/span/text()')
        resultList.append("".join(result))
    # XPath patterns observed for other question/answer layouts:
    #   //*[@id="form1"]/div[2]/div[3]/div/p/span
    #   //*[@id="form1"]/div[3]/div[3]/div/p/span
    #   //*[@id="form1"]/div[8]/div[3]/div/p[1]/span[1]
    #   //*[@id="form1"]/div[1]/div[3]/span[2]/div/span
    #   //*[@id="form1"]/div[40]/div[3]/span[2]/div/span
    #   //*[@id="form1"]/div[1]/div[4]/span[2]
    #   //*[@id="form1"]/div[80]/div[4]/span[2]
    #   //*[@id="form1"]/div[1]/div[3]/div/span
    #   //*[@id="form1"]/div[20]/div[3]/div/span
    # Browser-console selector for the PDF-viewer variant of the page:
    #   document.querySelector("#app > div > div.pdf-container > span > canvas")
    # print(result)
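
# Sketch of a more tolerant extractor, given that the exact answer path
# varies between question types (see the XPath notes above). This is an
# assumption-based variant, not the original author's method: it collects
# all text under each question's third child div instead of one span path.
def crawler_answer_tolerant(tree, question_count=20):
    answers = []
    for i in range(1, question_count + 1):
        texts = tree.xpath(f'//*[@id="form1"]/div[{i}]/div[3]//text()')
        answers.append("".join(t.strip() for t in texts))
    return answers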
# Write the scraped answers into a .docx file
def writeDocx():
    file = docx.Document()
    for answer in resultList:
        paragraph = file.add_paragraph(answer)
        # Spacing is set on paragraph_format, not on the Paragraph itself
        paragraph.paragraph_format.space_after = Pt(30)
    # Raw string so the backslashes in the Windows path stay literal
    file.save(r"E:\QSXTAnswer\QSXTAnswer.docx")
    print("success")
if __name__ == "__main__":
    Crawler_Answer()
    writeDocx()
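
# The AccessToken cookie above is a JWT whose payload carries an "exp"
# claim, so the hard-coded login eventually expires. A fail-fast check is
# worth adding; the condition below is an assumption about the site (an
# expired session presumably serves a page without the exercise form).
def session_is_valid(page):
    return page.status_code == 200 and 'id="form1"' in page.text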
From a web-scraping beginner.