爬取周公解梦主页数据
1.查看周公解梦网站html
2.代码解析
# Fetch the Zhougong dream-interpretation index page and list the
# category blocks (tags with class 'postitemjm').
url = 'https://www.zgjm.net/b/jiemeng/'
headers = {
    "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
# Bug fix: the headers dict was built but never sent; pass it so the site
# sees a browser User-Agent instead of the default python-requests one.
response = requests.get(url, headers=headers)
# Decode the raw bytes explicitly as UTF-8 (the site's encoding).
html_doc = response.content.decode('utf-8')
soup = BeautifulSoup(html_doc, "lxml")
urllist = soup.find_all(class_='postitemjm')  # one block per dream category
print(urllist)
运行结果:(输出为页面上各个 postitemjm 分类标签,其中每个 li 元素包含一个子页面链接)
3.剩下的就是循环取li标签的href,爬取子页面数据结合数据库保存这些数据
def add_data(type_name, param, key_name):
    """Fetch one dream-entry page and insert it into the `dream` table.

    Args:
        type_name: category label (e.g. '人物'), stored in the `type` column.
        param: URL path of the entry page, e.g. '/b/124/'.
        key_name: the dream keyword, stored in the `key_name` column.

    Uses the module-level cursor `cur`; the caller is responsible for
    committing and closing the connection.
    """
    url = 'https://www.zgjm.net' + param
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    try:
        # Bug fix: send the prepared headers (previously built but unused).
        response = requests.get(url, headers=headers)
        html_doc = response.content.decode('utf-8')
        soup = BeautifulSoup(html_doc, "lxml")
        content = soup.find(class_='article-content')
        title = soup.find(class_='article-title')
        # Security fix: parameterized query instead of string concatenation —
        # the scraped title/content may contain quotes, which would break the
        # statement and opens the door to SQL injection.
        cur.execute(
            "insert into dream(type,key_name,title,content) VALUES (%s, %s, %s, %s)",
            (type_name, key_name, str(title.string), str(content.text)),
        )
    except UnicodeDecodeError:
        # Some pages are not valid UTF-8; log the keyword and skip them.
        print(key_name, "Unicodeerror")
if __name__ == '__main__':
    # NOTE(review): `ip` and `port` are not defined in this snippet — they
    # must be supplied before running, and `db` should name a real database.
    conn = pymysql.connect(host=ip, port=port, user='root', passwd='', db='', charset='utf8mb4')
    cur = conn.cursor()
    try:
        # down_meng()
        add_data('人物', '/b/124/', '下雪')
        # Bug fix: without a commit the inserted row is discarded when the
        # connection closes.
        conn.commit()
    finally:
        # Ensure cleanup runs even if the insert raises.
        cur.close()
        conn.close()
4.最后贴上数据库表结构及所有代码
数据截图:
最后是所有代码:
import json
import requests
import time
from bs4 import BeautifulSoup
import pymysql
def add_data(type_name, param, key_name):
    """Fetch one dream-entry page and insert it into the `dream` table.

    Args:
        type_name: category label (e.g. '人物'), stored in the `type` column.
        param: URL path of the entry page, e.g. '/b/124/'.
        key_name: the dream keyword, stored in the `key_name` column.

    Uses the module-level cursor `cur`; committing is left to the caller
    (down_meng commits once per category).
    """
    url = 'https://www.zgjm.net' + param
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    try:
        # Bug fix: send the prepared headers (previously built but unused).
        response = requests.get(url, headers=headers)
        html_doc = response.content.decode('utf-8')
        soup = BeautifulSoup(html_doc, "lxml")
        content = soup.find(class_='article-content')
        title = soup.find(class_='article-title')
        # Security fix: parameterized query instead of string concatenation —
        # scraped text may contain quotes, which breaks the statement and is
        # an SQL-injection vector.
        cur.execute(
            "insert into dream(type,key_name,title,content) VALUES (%s, %s, %s, %s)",
            (type_name, key_name, str(title.string), str(content.text)),
        )
    except UnicodeDecodeError:
        # Some pages are not valid UTF-8; log the keyword and skip them.
        print(key_name, "Unicodeerror")
def down_meng():
    """Crawl the index page and store every dream entry, category by category.

    Each `postitemjm` block on the index corresponds to one category label
    in `categories`; every <li> inside it links to one entry page, which is
    fetched and saved via add_data(). Commits once per category and sleeps
    between categories to be polite to the server.

    Uses the module-level connection `conn` (for commits).
    """
    url = 'https://www.zgjm.net/b/jiemeng/'
    headers = {
        "User-Agent": 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
    }
    # Bug fix: send the prepared headers (previously built but unused).
    response = requests.get(url, headers=headers)
    html_doc = response.content.decode('utf-8')
    soup = BeautifulSoup(html_doc, "lxml")
    urllist = soup.find_all(class_='postitemjm')
    # Category labels, in the order the blocks appear on the index page.
    categories = {0: "人物", 1: "动物", 2: "植物", 3: "物品", 4: "活动", 5: "情感",
                  6: "生活", 7: "鬼神", 8: "自然", 9: "建筑", 10: "其他"}
    # enumerate() replaces the manual `i` counter; .get() with a fallback
    # avoids a KeyError if the page ever grows more blocks than labels.
    for i, block in enumerate(urllist):
        category = categories.get(i, "其他")
        for item in block.find_all('li'):
            print(item.a.string, "完成")
            add_data(category, item.a['href'], item.a.string)
        conn.commit()  # persist one whole category at a time
        time.sleep(3)  # throttle between categories
if __name__ == '__main__':
    # NOTE(review): `ip`, `port` and `db` are not defined in this file —
    # they must be configured before the script can run.
    conn = pymysql.connect(host=ip, port=port, user='root', passwd='', db=db, charset='utf8mb4')
    cur = conn.cursor()
    try:
        down_meng()
        # add_data('人物','/b/124/','下雪')
    finally:
        # Ensure cleanup runs even if the crawl raises mid-way.
        cur.close()
        conn.close()