方式一:
自动创建数据库和数据库表,如果已存在就不会创建。
特别要注意创建数据库和创建表时,设置默认字符集为utf8。
python代码
import re
import requests
import os
import sys
import time
import random
from xpinyin import Pinyin
import numpy as np
import pymysql
# Database helper: creates the database/table if missing and inserts rows.
class class_database:
    """Thin wrapper around pymysql for one MySQL database.

    On construction, creates the database (if it does not already exist)
    with utf8 as the default charset; offers helpers to create a table
    and to insert activity rows.
    """

    def __init__(self, databaseName):
        self.databaseName = databaseName
        self.host = 'xxx.xxx.xxx.xxx'  # public IP of the MySQL server
        self.user = 'xxx'              # MySQL user name
        self.password = '****'         # MySQL password
        self.port = 3306               # MySQL port
        self.createDB()

    def createDB(self):
        """Create the database if it does not exist (utf8 default charset)."""
        mydb = pymysql.connect(
            host=self.host,
            user=self.user,
            passwd=self.password
        )
        try:
            with mydb.cursor() as mycursor:
                # utf8 default charset so Chinese text round-trips; without it
                # inserts fail with 1366 "Incorrect string value".
                mycursor.execute("CREATE DATABASE IF NOT EXISTS " +
                                 self.databaseName +
                                 " DEFAULT CHARACTER SET utf8")
        finally:
            mydb.close()  # original leaked this connection

    def connectDB(self):
        """Open and return a connection to self.databaseName.

        charset='utf8' must match the database charset so Chinese text
        is stored/displayed correctly.
        """
        db = pymysql.connect(host=self.host, user=self.user, password=self.password,
                             db=self.databaseName, port=self.port, charset='utf8')
        return db

    def createTable(self, tableName, sql):
        """Execute a caller-supplied CREATE TABLE IF NOT EXISTS statement."""
        DB_create = self.connectDB()
        try:
            with DB_create.cursor() as cursor_create:
                cursor_create.execute(sql)
        finally:
            DB_create.close()
        print('create table ' + tableName + ' successfully')

    def inserttable(self, tableName, sql, zone, count, tuple):
        """Insert one activity row.

        ``tuple`` (name kept for caller compatibility, though it shadows the
        builtin) holds: title, fee, type_info, time, join, interested,
        division, position, owner, detail_url. join/interested are cast to int
        to match the INT columns.
        """
        DB_insert = self.connectDB()
        try:
            t = [count, zone, tuple[0], tuple[1], tuple[2], tuple[3],
                 int(tuple[4]), int(tuple[5]),
                 tuple[6], tuple[7], tuple[8], tuple[9]]
            with DB_insert.cursor() as cursor_insert:
                # Parameterized execute: pymysql escapes every value, so
                # free-text fields cannot break the statement.
                cursor_insert.execute(sql, t)
            DB_insert.commit()
        finally:
            DB_insert.close()
        print('insert contents to ' + tableName + ' successfully')
# Crawler for douban local activities (events), saved to MySQL via class_database.
class douban_local_activity(object):
    """Scrape a city's douban activity listings per district/tag and persist them."""

    def __init__(self, city, activity_tag_lists):
        self.division_list = []
        self.city = city
        p = Pinyin()
        self.city_pinyin = p.get_pinyin(city, '')
        self.activity_tag_lists = activity_tag_lists
        # Small pool of User-Agent headers; one is picked at random per request.
        self.hds = [
            {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'},
            {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'},
            {'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'},
            {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Trident/6.0)'},
        ]
        # NOTE(review): session cookie copied from a logged-in browser; it will
        # expire — refresh it before running.
        self.cookies = {'Cookie':'bid=STBb9aHzI4Y; __yadk_uid=ZT5U34Z2UfJW17TSG3EJu33qSJqmh021; _vwo_uuid_v2=D47938034A35EBAE2398C927CAFC2B7E2|559ed3be181cf150f83a1c5e30a23c87; douban-fav-remind=1; gr_user_id=8bee0390-ec32-4d85-9f7d-f19acf2bba2c; ps=y; _ga=GA1.2.1764404011.1524626232; __utmc=30149280; __utmz=30149280.1537237849.21.13.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; ct=y; viewed="1016567_6754416_1962350_1775691_30270830_27608239_27094706_6082808"; ap_v=0,6.0; loc-last-index-location-id="118282"; _pk_ref.100001.8cb4=%5B%22%22%2C%22%22%2C1537345408%2C%22https%3A%2F%2Fbeijing.douban.com%2Fevents%2Ffuture-all-128519%3Fstart%3D20%22%5D; _pk_ses.100001.8cb4=*; __utma=30149280.1764404011.1524626232.1537339546.1537345409.29; _gid=GA1.2.1200789924.1537345415; dbcl2="183532457:8zqxbVkIaFI"; ck=WAZC; push_noty_num=0; push_doumail_num=0; __utmv=30149280.18353; ll="118159"; __utmb=30149280.6.10.1537345409; _pk_id.100001.8cb4=48e28a093264a847.1524626231.29.1537346192.1537342847.'}
        print('开始爬取内容')

    def getSource(self, url):
        """Fetch one page with a random User-Agent and return decoded HTML text."""
        html = requests.get(url, headers=random.choice(self.hds), cookies=self.cookies)
        time.sleep(random.random())  # polite random delay to avoid hammering the site
        html.encoding = 'utf-8'
        return html.text

    def getDetail(self, url):
        """Fetch an event's detail page and return its '类型' label ('' if absent)."""
        source = self.getSource(url)  # same fetch logic as listing pages
        try:
            type_info = re.findall(r'<div class="event-detail">\s+<span class="pl">类型.*itemprop="eventType">(.*?)</a>', source, re.S)[0]
        except IndexError:
            type_info = ''
        return str(type_info)

    def getAllEvents(self, source):
        """Return the list of per-event HTML snippets on one listing page."""
        try:
            biggerHtml = re.search(r'<ul class="events-list(.*?)<div class="paginator">', source, re.S).group(1)
            events = re.findall(r'(<li class="list-entry".*?</p>\s+</div>\s+</li>)', biggerHtml, re.S)
        except AttributeError:  # listing markup not found on this page
            events = []
        return events

    def getTotalPage(self, source):
        """Return the total number of listing pages as an int (1 if not shown)."""
        try:
            # Original returned a str here and an int in the except branch;
            # normalize to int so callers need no cast.
            return int(re.findall(r'<span class="thispage" data-total-page="(.*?)">', source, re.S)[0])
        except (IndexError, ValueError):
            return 1

    def getAllDivision(self, source):
        """Parse the district filter; return [[name, url], ...] minus the '全部' entry."""
        div_list = []
        try:
            biggerHtml = re.search(r'<div class="events-nav-item events-filter-loc">(.*?)<div class="events-nav-item">', source, re.S).group(1)
            for item in re.findall(r'(<li><a href=".*?</a></li>)', biggerHtml, re.S):
                d_name = re.search(r'<li><a href=".*">\s+(.*?)\s+</a></li>', item, re.S).group(1)
                d_url = re.search(r'<li><a href="(.*?)">', item, re.S).group(1)
                print('d_name:' + d_name + ', d_url:' + d_url)
                if "全部" != d_name:
                    div_list.append([d_name, d_url])
        except AttributeError:
            pass  # district nav missing: best-effort, return whatever was collected
        return div_list

    def getEntity(self, event, division):
        """Extract one event's fields from its list snippet.

        Returns a one-element list of
        [title, fee, type, time, join, interested, division, position, owner, url].
        """
        title = re.search(r'<span itemprop="summary">(.*?)</span>', event, re.S).group(1)
        # 'when' instead of 'time': the original shadowed the time module here.
        when = re.search(r'时间:</span>\s+(.*?)\s+<time', event, re.S).group(1)
        position = re.search(r'<li title="(.*?)">\s+<span', event, re.S).group(1)
        fee = re.search(r'<strong>(.*?)</strong>', event, re.S).group(1)
        # Organizer.
        owner = re.search(r'<a target="db-event-owner" href=".*">(.*?)</a>', event, re.S).group(1)
        # Participation / interest counters (digit strings).
        join = re.search(r'<p class="counts">\s+<span>(.*?)人参加</span>', event, re.S).group(1)
        interested = re.search(r'<span class="pipe"></span> <span>(.*?)人感兴趣</span>', event, re.S).group(1)
        detail_url = re.search(r'<div class="pic">\s+<a href="(.*?)">\s+<img data-lazy="', event, re.S).group(1)
        type_info = self.getDetail(detail_url)
        return [[title, fee, type_info, when, join, interested, division, position, owner, detail_url]]

    def save2mysql(self, activity_lists, activity_tag_lists):
        """Create douban_activity.<city_pinyin> and insert every scraped event."""
        db_name = 'douban_activity'
        table_name = self.city_pinyin
        # Instantiating class_database also creates the database if needed.
        cls_db = class_database(db_name)
        sql_create_table = ('create table if not exists ' + table_name +
                            '(count INT PRIMARY KEY NOT NULL,zone CHAR(32),title CHAR(128),fee CHAR(32),type_info CHAR(32),time CHAR(64),join_cnt INT,interested INT,division CHAR(16),position CHAR(64),owner CHAR(32),detail_url CHAR(64)) DEFAULT CHARSET=utf8;')
        cls_db.createTable(table_name, sql_create_table)
        p = Pinyin()
        # Loop-invariant: build the parameterized INSERT once, not per row.
        sql_insert = ('insert into ' + table_name +
                      '(count,zone,title,fee,type_info,time,join_cnt,interested,division,position,owner,detail_url)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);')
        # PK counter. The original reset it to 1 per division, which made the
        # PRIMARY KEY collide across divisions (MySQL error 1062) — keep it
        # monotonically increasing over all inserts instead.
        count = 1
        for i in range(len(activity_tag_lists)):
            for j in range(len(self.division_list)):
                div = self.division_list[j]
                for bl in activity_lists[j]:
                    cls_db.inserttable(table_name, sql_insert, p.get_pinyin(div[0], ''), count, bl)
                    count += 1

    def main(self):
        """Crawl every district x tag combination for the city, then persist."""
        event_lists = []
        # First request: discover the district URLs ('全部' is filtered out).
        url = 'https://www.douban.com/location/' + self.city_pinyin + '/events/future-all'
        print('第一次请求url:' + url)
        html = self.getSource(url)
        self.division_list = self.getAllDivision(html)
        for div in self.division_list:
            division_events = []
            d_name = div[0]
            url = div[1]
            for tag in self.activity_tag_lists:
                # Splice the tag into the url ("future-all" -> "future-<tag>-all").
                tag_url = url.split("-")[0] + "-" + tag[1] + "-" + url.split("-")[-1]
                total_page = 1
                page = 0
                while page < total_page:
                    link = tag_url + '?start=%s' % (page * 10)
                    print('正在处理页面:' + link)
                    html = self.getSource(link)
                    if page == 0:
                        # Real page count only appears once the first page loads.
                        total_page = self.getTotalPage(html)
                        print('总页数:%d' % total_page)
                    for item in self.getAllEvents(html):
                        division_events.extend(self.getEntity(item, d_name))
                    page += 1
                print('tag=%s, division_events len=%d' % (tag[0], len(division_events)))
            if division_events:
                # Sort by join count numerically; the original compared the raw
                # strings, which ordered '9' ahead of '10'.
                division_events = sorted(division_events, key=lambda x: int(x[4]), reverse=True)
                event_lists.append(division_events)
        # Persist everything to MySQL.
        self.save2mysql(event_lists, self.activity_tag_lists)
# ---------------------------------------------
if __name__ == '__main__':
    # Only the "全部"/"all" tag is active; the block below lists the other
    # known category tags, kept for easy re-enabling.
    activity_tag_lists = [["全部", "all"]]
    '''
    activity_tag_lists.append(["音乐", "music"])
    activity_tag_lists.append(["戏剧", "drama"])
    activity_tag_lists.append(["讲座", "salon"])
    activity_tag_lists.append(["聚会", "party"])
    activity_tag_lists.append(["电影", "film"])
    activity_tag_lists.append(["展览", "exhibition"])
    activity_tag_lists.append(["运动", "sports"])
    activity_tag_lists.append(["公益", "commonweal"])
    activity_tag_lists.append(["旅行", "travel"])
    activity_tag_lists.append(["赛事", "competition"])
    activity_tag_lists.append(["课程", "course"])
    activity_tag_lists.append(["亲子", "kids"])
    activity_tag_lists.append(["其他", "others"])
    '''
    # Other candidate cities: 深圳 梧州 兰州 福州 武汉 南京 上海 温州
    city_lists = ['温州']
    for city in city_lists:
        crawler = douban_local_activity(city, activity_tag_lists)
        crawler.main()
方式二:
前提:已创建test数据库和dongman表结构
python代码
# -*- coding: utf-8 -*-
import os,sys
import requests
import bs4
import pymysql  # drop-in replacement for MySQLdb on Python 3
#--------------
# Precondition: the `test` database and the `dongman` table already exist.
#--------------
# Connect to MySQL.
# NOTE(review): the original connect line was garbled
# (host='x', user='rootxx.xxx.xxx.xxx'); the placeholders below restore the
# obvious intent — fill in your server's IP and credentials.
db = pymysql.connect(host='xxx.xxx.xxx.xxx', user='root', password='root***',
                     db='test', port=3306, charset='utf8')
print('连接数据库成功!')
cursor = db.cursor()  # the original named this `conn`, which read as a connection
cursor.execute('set names utf8')  # force utf8 so Chinese text is stored correctly

# Fetch Monday's schedule page and pick out the per-title <li> cards.
page_url = 'https://www.dongmanmanhua.cn/dailySchedule?weekday=MONDAY'
result = requests.get(page_url)
data = bs4.BeautifulSoup(result.text, 'html.parser')
lidata = data.select('div#dailyList ul.daily_card li')

# Parameterized INSERT: pymysql escapes each value, so titles containing
# quotes cannot break the statement (unlike %-formatting the SQL string).
sql = u"INSERT INTO dongman(did,name,url,story,user,likes) VALUES(%s,%s,%s,%s,%s,%s)"
try:
    for x in lidata:
        did = x.get('data-title-no')  # numeric title id carried as a data attribute
        print(did)
        name1 = x.select('p.subj')[0].get_text()
        url = x.a.get('href')
        story1 = x.a.p.string
        user1 = x.select('p.author')[0].get_text()
        like1 = x.select('em.grade_num')[0].get_text()
        # Write the row into MySQL.
        cursor.execute(sql, [did, name1, url, story1, user1, like1])
    db.commit()  # single commit for the whole batch (original committed per row)
    print('插入数据成功!')
finally:
    # Always release the cursor and connection, even if a row fails.
    cursor.close()
    db.close()
报错问题解决方法
1. pymysql.err.DataError: (1366, "Incorrect string value: '\xE7\x9A\x84\xE5\x8F\x91...' for column ...")
对数据库和表进行字符集编辑
2. pymysql.err.IntegrityError: (1062, "Duplicate entry '...' for key 'PRIMARY'")
主键重复导致,可以修改数据库表不设置主键。
通过navicat for mysql客户端查看成功写入数据库表中的数据: