from bs4 import BeautifulSoup
import urllib.request
import urllib.parse
from urllib.request import urlopen
import requests
from lxml import etree
# def load_page(url):
# headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
# request=urllib.request.Request(url,headers=headers)
# return urllib.request.urlopen(request).read().decode('utf-8')
def write_pagew(file1):
    """Overwrite pachong.txt with the given text.

    Args:
        file1: Text content to write (replaces any existing content).
    """
    # The 'with' statement closes the file automatically; the original
    # also called file.close() redundantly inside the block.
    with open('pachong.txt', 'w', encoding='utf-8') as file:
        file.write(file1)
def write_pagea(file1):
    """Append the given text to pachong.txt.

    Args:
        file1: Text content to append.
    """
    # The 'with' statement closes the file automatically; the original
    # also called file.close() redundantly inside the block.
    with open('pachong.txt', 'a', encoding='utf-8') as file:
        file.write(file1)
def headers_page(url='http://www.zjvtit.edu.cn/xwzx/jyxw.htm'):
    """Build a browser-masquerading request for *url* and open it.

    Args:
        url: Page to request. BUG FIX: the original read an undefined
            module-level global ``url`` and raised NameError unless the
            ``__main__`` block had already run; it is now a parameter
            defaulting to the site's news index page.

    Returns:
        Tuple ``(request, response)`` — the prepared
        :class:`urllib.request.Request` and the open HTTP response.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    return request, response
def tran_url(url):
    """Convert relative hrefs scraped from the listing page into absolute URLs.

    Args:
        url: Iterable of href strings, typically starting with '../' or '../../'.

    Returns:
        List of absolute URLs rooted at ``http://www.zjvtit.edu.cn//``.
        Hrefs that match neither prefix are skipped (a notice is printed).
    """
    base = 'http://www.zjvtit.edu.cn//'
    xiuzheng_url = []
    for xiuzheng in url:
        # BUG FIX: str.lstrip('../..') strips a *character set* ('.' and '/'),
        # not a literal prefix, so any path whose first segment begins with a
        # dot was silently truncated. Slice off the exact prefix instead.
        if xiuzheng.startswith('../..'):
            xiuzheng_url.append(base + xiuzheng[len('../../'):])
        elif xiuzheng.startswith('../'):
            xiuzheng_url.append(base + xiuzheng[len('../'):])
        else:
            print('没啥问题')
    print(xiuzheng_url)
    return xiuzheng_url
def huoquyeshu_page(url):
    """Fetch the paginated news index and extract the total page count.

    Args:
        url: URL of the news listing page containing the pager element.

    Returns:
        The page-count digits as a string (e.g. ``'123'``), taken from the
        pager text; an empty string if the pager element was not found.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    result = s.xpath('//*[@id="fanye126908"]/text()')
    # BUG FIX: the original referenced the loop variable after the loop,
    # raising NameError whenever the xpath result was empty, and built an
    # unused result_str list. Only the last text node is actually used.
    if not result:
        return ''
    # The pager text ends with something like "1/123页"; the characters at
    # [-4:-1] are taken as the page total. NOTE(review): assumes the total
    # is exactly 3 digits — TODO confirm against the live page markup.
    return result[-1][-4:-1]
def neirong(url):
    """Download an article page and print its title, publish time and body.

    Args:
        url: Absolute article URL; an empty string is silently ignored
            (the listing page can yield no usable href for a slot).
    """
    # Guard clause replaces the original `if url == '': pass / else:` nesting.
    if not url:
        return
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    res = requests.get(url, headers=headers)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    biaoti = soup.select('.zw-title')[0].text
    # ".zw-other" holds "发布时间:... 浏览量:..."; keep only the part before
    # "浏". NOTE(review): lstrip here strips a character set, not a prefix —
    # harmless while the timestamp starts with a digit, but fragile.
    other = soup.select('.zw-other')[0].text
    timesource = other.split('浏')[0].strip().lstrip('发布时间:')
    result = {}
    result['title'] = biaoti
    result['timesource'] = timesource
    # Last <p> under .zw is dropped, matching the original slice [:-1].
    result['article'] = ' '.join(p.text.strip() for p in soup.select('.zw p')[:-1])
    print(result)
def shouye_spider():
    """Scrape the 20 articles linked from the first (home) news listing page.

    Fetches the listing once, then for each of the 20 ``line_u9_*`` slots
    extracts the article href, absolutizes it with ``tran_url`` and hands
    it to ``neirong`` for download and printing.
    """
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('首页是' + url)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
    request = urllib.request.Request(url, headers=headers)
    response = urllib.request.urlopen(request)
    html = response.read().decode('utf-8')
    s = etree.HTML(html)
    # BUG FIX: the original re-downloaded the same listing page *twice* on
    # every loop iteration (urlopen via Request plus a bare urlopen) — 40
    # redundant HTTP requests. One fetch above is enough; it also ran the
    # same xpath twice and built an unused title xpath.
    for d in range(0, 20):
        result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
        result_pachong = s.xpath(result_xpath_url)
        result_pachong = tran_url(result_pachong)
        neirong("".join(result_pachong))
def fanye_page(yeshu, paquyeshu):
    """Scrape ``paquyeshu`` listing pages of the news archive.

    Archive pages are numbered ``.../jyxw/<n>.htm`` with the highest number
    being the most recent; this walks the last ``paquyeshu`` page numbers.

    Args:
        yeshu: Total page count as a digit string (from ``huoquyeshu_page``).
        paquyeshu: How many listing pages to scrape.
    """
    # BUG FIX: int() instead of eval() — never eval text scraped from the web.
    total = int(yeshu)
    for b in range(total - paquyeshu + 1, total):
        url = 'http://www.zjvtit.edu.cn/xwzx/jyxw/' + str(b) + '.htm'
        print('第' + str(total - b) + '页是' + url)
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36'}
        request = urllib.request.Request(url, headers=headers)
        response = urllib.request.urlopen(request)
        html = response.read().decode('utf-8')
        s = etree.HTML(html)
        # BUG FIX: the original re-downloaded this same listing page twice
        # per article slot inside the inner loop (and once parsed the URL
        # string itself as HTML). Fetch once per page, then walk the slots.
        for d in range(0, 20):
            result_xpath_url = '//*[@id="line_u9_' + str(d) + '"]/a/@href'
            result_pachong = s.xpath(result_xpath_url)
            # Consistency: reuse tran_url instead of an inlined copy of the
            # same prefix-fixing loop.
            xiuzheng_url = tran_url(result_pachong)
            neirong("".join(xiuzheng_url))
if __name__ == '__main__':
    # Entry point: scrape the requested number of archive pages, then the
    # home listing page.
    url = 'http://www.zjvtit.edu.cn/xwzx/jyxw.htm'
    print('正在爬取浙江交通职业技术学院相关信息')
    # BUG FIX: int() instead of eval() — never eval raw user input.
    paquyeshu = int(input('请输入要爬取多少页:'))
    fanye_page(huoquyeshu_page(url), paquyeshu)
    shouye_spider()
# NOTE(review): stray CSDN blog-page artifacts removed from the paste —
# the lines '交院1' and '最新推荐文章于 2021-11-03 10:41:30 发布' were page
# chrome, not code, and were a syntax error in this file.