#http://www.18ladys.com/post/buchong/
#User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36 Edg/89.0.774.50
#https://www.daquan.com/post/2411.html
from bs4 import BeautifulSoup
from lxml import etree
import requests
from splinter import Browser
from selenium import webdriver
import urllib
import time
import os
import re
import base64
from splinter import Browser
import sys
import urllib.request
# Browser-like headers: some sites reject requests with the default
# urllib User-Agent, so every page fetch sends this one.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.82 Safari/537.36 Edg/89.0.774.50"
}

time.sleep(4)  # initial pause before the first request


def _extract_article(html):
    """Parse one article page and return (title, body_text).

    `html` is the raw page bytes/str. The title is the <h1> text inside
    the <div class="article"> container; the body is every <p> in that
    container joined with newlines. Raises AttributeError if the page
    does not contain the expected markup.
    """
    soup = BeautifulSoup(html, 'html.parser')
    article = soup.find('div', {'class': 'article'})
    title = article.find('h1').get_text()
    paragraphs = [p.get_text() for p in article.find_all('p')]
    return title, '\n'.join(paragraphs) + '\n'


num = 0
for page in range(2411, 2423):
    new_url = 'https://www.daquan.com/post/{}.html'.format(page)
    time.sleep(3)  # throttle: one request every 3 seconds to be polite
    # Send the browser User-Agent (previously defined but unused by this path).
    request = urllib.request.Request(new_url, headers=headers)
    html = urllib.request.urlopen(request).read()
    title, text = _extract_article(html)
    # Titles may contain characters that are illegal in filenames; replace them.
    safe_name = re.sub(r'[\\/:*?"<>|]', '_', title.strip())
    # BUG FIX: the original accumulated text across pages and wrote the list
    # repr via str(out_text); each file now gets only its own page's text.
    # encoding='utf-8' so Chinese content writes correctly on any platform.
    with open(safe_name + ".txt", "w", encoding="utf-8") as f:
        f.write(text)
    num += 1
    print("正在爬取" + str(num) + "的内容")
print("全部爬取")
# Scrape a Chinese herbal-medicine website (爬取中草药网站)
# (CSDN footer artifact: latest recommended article published 2024-06-09 11:01:50)