#http://www.18ladys.com/post/buchong/
#User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36 Edg/89.0.774.50
#https://www.daquan.com/post/2411.html
from bs4 import BeautifulSoup
from lxml import etree
import requests
from splinter import Browser
from selenium import webdriver
import urllib
import time
import os
import re
import base64
from splinter import Browser
import sys
import urllib.request
# Browser-like headers: some sites reject requests with the default
# urllib User-Agent, so every page fetch sends this one.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36 Edg/89.0.774.50"
}

time.sleep(4)  # initial pause before the first request


def _extract_article(html):
    """Parse one article page and return (title, body_text).

    `html` is the raw page bytes/str. The title is the <h1> text inside
    the <div class="article"> container; the body is every <p> in that
    container joined with newlines. Raises AttributeError if the page
    does not contain the expected markup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find('div', {'class': 'article'})
    title = article.find('h1').get_text()
    paragraphs = [p.get_text() for p in article.find_all('p')]
    return title, '\n'.join(paragraphs) + '\n'


num = 0
for page in range(2411, 2423):
    new_url = 'https://www.daquan.com/post/{}.html'.format(page)
    time.sleep(3)  # throttle: one request every 3 seconds to be polite
    # Send the browser User-Agent (previously defined but unused by this path).
    request = urllib.request.Request(new_url, headers=headers)
    html = urllib.request.urlopen(request).read()
    title, text = _extract_article(html)
    # Titles may contain characters that are illegal in filenames; replace them.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', title.strip())
    # BUG FIX: the original accumulated text across pages and wrote the list
    # repr via str(out_text); each file now gets only its own page's text.
    # encoding='utf-8' so Chinese content writes correctly on any platform.
    with open(safe_name + ".txt", "w", encoding="utf-8") as f:
        f.write(text)
    num += 1
    print("正在爬取" + str(num) + "的内容")
print("全部爬取")
# Scrape a Chinese herbal-medicine website (爬取中草药网站)
# (CSDN footer artifact: latest recommended article published 2024-06-09 11:01:50)