本来是想爬取某网站的古诗词,但是这个网站有限制,只能爬取十页的数据,再多接口就会返回 500。后来发现该网站有 App 端,于是通过 Fiddler 抓取 App 的接口,爬取了十万首左右的古诗词。
数据是通过 Python 的正则表达式爬取的,没用框架;设计表的时候也有一些问题,不过嫌麻烦没有优化。话不多说,上代码。
因为当初写了好几个版本,忘了哪个是完整的了,所以找了个差不多的粘贴上来。
最后把整个 App 里的诗、词、曲、四书五经等内容,连同翻译、注释、赏析,全都爬取出来了。
import re,os
import pymysql
import requests
import random
import time
# Silence urllib3 warnings: the requests in this file are sent with
# verify=False, which would otherwise emit an InsecureRequestWarning on
# every HTTPS call.
requests.packages.urllib3.disable_warnings()
# Proxy endpoints as "host:port" strings, kept in their original order.
# NOTE(review): '125.108.81.211:9999' and '175.42.68.194:9999' each appear
# twice, so random.choice() picks them twice as often — confirm whether the
# duplication is intentional.
_PROXY_ADDRESSES = (
    '118.113.246.131:9999',
    '36.249.109.18:9999',
    '114.104.142.65:9999',
    '113.128.31.217:9999',
    '171.12.112.155:9999',
    '1.197.16.218:9999',
    '182.34.36.100:9999',
    '36.249.119.34:9999',
    '175.43.59.4:9999',
    '113.124.87.65:9999',
    '125.108.81.211:9999',
    '175.42.68.194:9999',
    '183.166.97.166:9999',
    '180.118.128.112:9000',
    '60.13.42.151:9999',
    '182.149.83.194:9999',
    '60.205.132.71:80',
    '120.79.64.147:8118',
    '121.232.194.144:9000',
    '171.35.160.55:9999',
    '36.248.129.82:9999',
    '171.15.48.137:9999',
    '163.204.245.210:9000',
    '117.88.5.116:3000',
    '144.123.71.3:9999',
    '125.108.81.211:9999',
    '120.234.138.102:53779',
    '175.42.68.194:9999',
    '120.83.105.247:9999',
    '112.111.217.56:9999',
)

# Pool of ``proxies`` mappings handed to requests.get(); one is drawn at random
# for each request.
# NOTE(review): every mapping only has an 'http' key. requests selects a proxy
# by URL scheme, so https:// URLs are presumably not routed through these
# proxies — confirm against the requests "Proxies" documentation.
proxy_list = [{'http': address} for address in _PROXY_ADDRESSES]
def get_json(url, timeout=10):
    """Fetch *url* through a randomly chosen proxy and return the decoded JSON.

    The GET request is repeated, with a freshly drawn proxy and a one-second
    pause between attempts, until the server answers with HTTP 200. There is
    no retry limit: a URL that never succeeds keeps this call looping.

    Args:
        url: Address to request with GET.
        timeout: Seconds to wait on each attempt before it counts as failed.
            Without a timeout a dead proxy could block the call forever.

    Returns:
        Whatever ``response.json()`` produces for the first 200 response.

    Raises:
        Any error raised by ``response.json()`` when the 200 response body is
        not valid JSON is propagated unchanged.
    """
    while True:
        # A new proxy is drawn for every attempt.
        proxy = random.choice(proxy_list)
        try:
            response = requests.get(
                url, verify=False, proxies=proxy, timeout=timeout
            )
        except requests.RequestException as exc:
            # Proxy/connection failures and timeouts are retried exactly like
            # a non-200 status instead of aborting the whole crawl.
            print(str(exc))
        else:
            if response.status_code == 200:
                return response.json()
            print(str(response.status_code))
        print('等待1秒..')
        time.sleep(1)
def get_Yijson(url, timeout=10):
    """Fetch *url* once through a random proxy; return its JSON or ``''``.

    Unlike ``get_json`` this makes a single attempt and never retries: any
    failure is reported to the caller as an empty string.

    Args:
        url: Address to request with GET.
        timeout: Seconds to wait for the response before giving up.

    Returns:
        The value of ``response.json()`` when the status code is 200;
        ``''`` when the status is anything else or the request itself fails
        (proxy error, connection error, timeout).

    Raises:
        Any error raised by ``response.json()`` when the 200 response body is
        not valid JSON is propagated unchanged.
    """
    proxy = random.choice(proxy_list)
    try:
        response = requests.get(
            url, verify=False, proxies=proxy, timeout=timeout
        )
    except requests.RequestException as exc:
        # A dead proxy is treated like a non-200 answer; the error is printed
        # so the failure is not silent.
        print(str(exc))
        return ''
    if response.status_code != 200:
        return ''
    return response.json()
#爬取作者姓名,和生平
def get_author():
authorList = []
for i in range(1,101):
json = get_json('https://app.gushiwen.cn:443/api/author/Default10.aspx?c=&page='+ str(i