一、抓取数据
# -*- coding:utf-8 -*-
import json
import sys
import base64
from typing import Optional
import requests
from pydantic import BaseModel
from lxml import etree
from loguru import logger
# Logging setup: discard whatever handlers loguru currently has, then register
# one stdout sink using the compact "time|level|module:line|message" layout.
logger.remove()
_stdout_sink_options = {
    'format': '<green>{time:MM-DD HH:mm:ss}</green>|{level}|<yellow>{name}:{line}</yellow>|<level>{message}</level>',
    'colorize': True,
    'diagnose': False,
}
logger.add(sys.stdout, **_stdout_sink_options)
class Article(BaseModel):
    """One scraped article; every field is optional and starts out as ``None``."""

    # The ``None`` defaults are explicit on purpose: pydantic v1 implied them
    # for ``Optional`` fields, but pydantic v2 treats an ``Optional`` field
    # without a default as required, so a bare ``Article()`` would fail
    # validation there.
    title: Optional[str] = None
    category: Optional[str] = None
    tag: Optional[str] = None
    author: Optional[str] = None
    content: Optional[str] = None
    date: Optional[str] = None
class LikeCs:
    """Crawler for www.likecs.com that collects articles grouped by category.

    Scraped articles accumulate in ``self.articles`` as
    ``{category name: [article dict, ...]}``; :meth:`main` writes that mapping
    to ``./article.json``.
    """

    # Seconds to wait on each HTTP request. ``requests`` applies no timeout by
    # default, so a stalled connection would otherwise block the crawl forever.
    request_timeout = 10

    def __init__(self):
        self.session = requests.session()
        self.articles = {}
        self.host = 'https://www.likecs.com'

    @staticmethod
    def _first_text(node, path):
        """Return the stripped first result of XPath ``path`` on ``node``, or ''."""
        matches = node.xpath(path)
        return matches[0].strip() if matches else ''

    def get_tree(self, url):
        """Fetch ``url`` with the shared session and return the parsed HTML tree."""
        resp = self.session.get(url, timeout=self.request_timeout)
        return etree.HTML(resp.text)

    def get_category(self, page_url):
        """Collect the categories linked from a navigation page.

        Every category found also gets a fresh, empty list in
        ``self.articles``.

        :param page_url: URL of the navigation page to scan.
        :return: list of ``{'name': ..., 'url': ...}`` dicts in page order.
        """
        logger.info('正在获取页面分类:{}'.format(page_url))
        tree = self.get_tree(page_url)
        lst = []
        for li in tree.xpath('//ul[@class="cardcont navsubs"]/li'):
            hrefs = li.xpath('./a/@href')
            names = li.xpath('./a/text()')
            if not hrefs or not names:
                # An <li> without a link or link text cannot be crawled; skip
                # it instead of failing the whole page with an IndexError.
                continue
            category = names[0].strip()
            item = {
                'name': category,
                'url': self.host + hrefs[0]
            }
            self.articles[category] = []
            logger.info(item)
            lst.append(item)
        logger.info('页面:{} 分类获取完毕!'.format(page_url))
        return lst

    def get_list(self, category, pages=25):
        """Crawl the listing pages of one category and store every article found.

        :param category: dict with ``'name'`` and ``'url'`` keys, as returned by
            :meth:`get_category`.
        :param pages: number of listing pages to walk. The page count is not
            read from the site; 25 is the value the crawler has always used.
        """
        category_name = category['name'].strip()
        first_page = self.get_tree(category['url'])
        for page in range(1, pages + 1):
            logger.info('正在爬取分类:{} 当前第{}页,总共{}页!'.format(category['name'], page, pages))
            if page == 1:
                # Page 1 is the category URL itself, which was already fetched.
                page_tree = first_page
            else:
                page_tree = self.get_tree(category['url'] + '?page={}'.format(page))
            for item in page_tree.xpath('//article[@class="post tag-uifont tag-webview"]'):
                article = Article()
                article.category = category_name
                # Missing metadata falls back to '' rather than aborting.
                titles = item.xpath('./header/h2/a/text()')
                article.title = titles[0].replace('\n', '').strip() if titles else ''
                article.author = self._first_text(item, './footer/text()')
                article.tag = self._first_text(item, './footer/a/text()')
                article.date = self._first_text(item, './footer/time/text()')

                hrefs = item.xpath('./header/h2/a/@href')
                if not hrefs:
                    # Without a link the article body cannot be fetched.
                    logger.warning('文章缺少链接,已跳过:{}'.format(article.title))
                    continue
                article = self.get_article(article, self.host + hrefs[0])
                logger.info('文章:{} 爬取成功!'.format(article.title))
                # setdefault keeps this usable for a category that was not
                # registered through get_category().
                self.articles.setdefault(article.category, []).append(article.dict())
                # NOTE: to publish each article as soon as it is scraped, call
                # publish_article_to_wordpress(article) here.

    def get_article(self, article: Article, article_url):
        """Fetch an article page and fill in ``article.content``.

        :param article: the article whose ``content`` is to be set (mutated in place).
        :param article_url: absolute URL of the article page.
        :return: the same ``article`` object; ``content`` is '' when the page
            has no ``post-content`` section.
        """
        tree = self.get_tree(article_url)
        sections = tree.xpath('//section[@class="post-content"]')
        article.content = sections[0].xpath('string(.)').strip() if sections else ''
        return article

    def main(self):
        """Crawl the configured navigation pages and dump the results to ``./article.json``."""
        page_url = [
            'https://www.likecs.com/nav/1.html',
            # 'https://www.likecs.com/nav/9.html',
            # 'https://www.likecs.com/nav/31.html',
            # 'https://www.likecs.com/nav/41.html',
            # 'https://www.likecs.com/nav/52.html',
            # 'https://www.likecs.com/nav/68.html',
            # 'https://www.likecs.com/nav/72.html'
        ]
        for url in page_url:
            # Only the first category of each navigation page is crawled.
            for category in self.get_category(url)[0:1]:
                self.get_list(category)
                logger.debug('-' * 50)
            logger.debug('-' * 50)
        # ensure_ascii=False writes raw non-ASCII text, so the file encoding
        # must be pinned instead of depending on the platform default.
        with open('./article.json', 'w', encoding='utf-8') as f:
            json.dump(self.articles, f, ensure_ascii=False)
二、配置WordPress
WordPress配置(Nginx伪静态规则)
# nginx rewrite rules (presumably placed in the server/location block that
# serves the WordPress site -- confirm against the full nginx config).

# When <request path>/index.html exists as a file, rewrite to it; "break"
# stops further rewrite processing for this request.
if (-f $request_filename/index.html){
rewrite (.*) $1/index.html break;
}
# When <request path>/index.php exists as a file, rewrite to it.
if (-f $request_filename/index.php){
rewrite (.*) $1/index.php;
}
# Anything that does not map to an existing file is rewritten to /index.php.
if (!-f $request_filename){
rewrite (.*) /index.php;
}
# URIs ending in /wp-admin (no trailing slash) get a permanent redirect to the
# same URI with a trailing slash appended.
rewrite /wp-admin$ $scheme://$host$uri/ permanent;
WordPress插件
WP REST API
WordPress REST API Authentication
三、发布到WordPress中
def publish_article_to_wordpress(article: Article, wp_host='xxx.xxx.xxx.xxx', wp_username='admin',
                                 wp_password='123456', category_ids=(3,), timeout=10):
    """Publish an article to WordPress via ``POST /wp-json/wp/v2/posts``.

    The outcome is logged; nothing is returned.

    :param article: the article to publish; ``title`` and ``content`` are sent.
    :param wp_host: WordPress host name or IP address (placeholder by default).
    :param wp_username: WordPress admin account name.
    :param wp_password: password for ``wp_username``.
    :param category_ids: WordPress category ids to file the post under.
    :param timeout: seconds to wait for the HTTP request before giving up.
    :raises requests.RequestException: if the request cannot be completed.
    """
    credentials = '{user}:{passwd}'.format(user=wp_username, passwd=wp_password)
    basic_auth = base64.b64encode(credentials.encode()).decode('utf-8')
    # NOTE(security): this is plain HTTP, so the Basic credentials travel
    # unencrypted; switch the scheme to https when the site supports it.
    api = 'http://{wp_host}/wp-json/wp/v2/posts'.format(wp_host=wp_host)
    headers = {
        'Authorization': 'Basic {basic_auth}'.format(basic_auth=basic_auth)
    }
    data = {
        'title': article.title,
        'content': article.content,
        'categories': list(category_ids),
        'status': 'publish'
    }
    # The context manager closes the session's connections when done.
    with requests.session() as session:
        resp = session.post(api, headers=headers, data=data, timeout=timeout)
        # The REST API answers 201 Created when the post has been stored.
        if resp.status_code == 201:
            logger.info('文章:{} 发布成功!'.format(article.title))
            return
        try:
            detail = resp.json()
        except ValueError:
            # Error pages from a proxy or web server are not JSON; log the
            # raw body instead of raising while reporting the failure.
            detail = resp.text
        logger.error('文章:{} 发布失败!{}'.format(article.title, detail))