一、PyCharm导入selenium,参考下面文章
https://blog.csdn.net/u010381752/article/details/98955424
二、引入:
from selenium import webdriver
三、初始化驱动
# Module-level handle to the shared Chrome WebDriver. It holds the placeholder
# value 0 until init_driver() rebinds it to a live driver instance.
dr = 0


def init_driver():
    """Create a Chrome WebDriver and publish it through the module-level ``dr``.

    Returns:
        None. The driver is only reachable through the global ``dr``.
    """
    global dr
    # The chromedriver binary is addressed by a relative path (inside the
    # project's venv), so it is resolved against the current working directory.
    dr = webdriver.Chrome(executable_path=r'venv\Scripts\chromedriver.exe')
四、采集文章
def _extract_aid(url):
    """Return the article id that follows ``group/`` in *url*, or ``''``.

    The id ends at the next ``/``, ``?`` or ``#`` (or at the end of the URL),
    so a URL without a trailing slash no longer loses its last character and
    a URL without the ``group/`` marker yields an empty id instead of an
    arbitrary slice of the URL.
    """
    marker = "group/"
    pos = url.find(marker)
    if pos == -1:
        return ''
    aid = url[pos + len(marker):]
    for sep in ('/', '?', '#'):
        aid = aid.split(sep, 1)[0]
    return aid.strip()


def _localize_images(news_content):
    """Download every ``<img src="...">`` in *news_content* and relink it.

    Returns:
        A ``(content, focus_img)`` tuple: the HTML with each remote image
        address replaced by the local path, and the list of local paths with
        the ``/data/attachment/tomwx`` prefix removed (the form the list page
        uses, per the original comment).
    """
    focus_img = []
    # Only matches tags whose first attribute is src="...".
    for src in re.findall(r'<img src="(.*?)".*?>', news_content):
        if not src:
            # str.replace('', x) would insert x between every character.
            continue
        # Download the image (requested with a '.jpg' suffix) and get back
        # the local path it was saved to.
        img_loc = downloadIma(src + '.jpg')
        img_loc = img_loc.replace('\\', '/').replace(r'./', r'/')
        focus_img.append(img_loc.replace('/data/attachment/tomwx', ''))
        # Replace the address exactly as it appears in the HTML. Rebuilding
        # it by deleting '.jpg' from the download URL would corrupt any
        # address that already contained '.jpg'.
        news_content = news_content.replace(src, img_loc)
    return news_content, focus_img


def AddArticleToData(resultDict):
    """Scrape one Toutiao article, store it in the database and log its aid.

    Args:
        resultDict: dict that must contain ``'url'``, ``'cate_id'`` and
            ``'param_cookie'``. It is also used for output: on failure
            ``resultDict['err']`` receives a short reason, on success
            ``resultDict['lastRowId']`` receives the id of the inserted
            article row (used by the caller to attach comments).

    Returns:
        True when the article was inserted, False when it was skipped
        (no aid in the URL, page data not found, or duplicate article).

    Raises:
        KeyError: if a required key is missing from *resultDict* or from the
            decoded page data.
    """
    url = resultDict['url']
    # NOTE(review): cate_id and param_cookie are not referenced by the code
    # visible here; presumably the elided INSERT statement consumes them.
    cate_id = resultDict['cate_id']
    param_cookie = resultDict['param_cookie']

    aid = _extract_aid(url)
    if not aid:
        resultDict['err'] = "aid null"
        return False

    # Rebuild the canonical article URL and load it in the shared browser.
    time_stamp = int(time.time())
    url = "https://www.toutiao.com/a%s/?timestamp=%s&app=news_article&group_id=%s" % (aid, time_stamp, aid)
    dr.get(url)
    str_html = dr.page_source

    searchObj = re.search(r'var BASE_DATA = ({[\s\S]*?});', str_html, re.M | re.I)
    if searchObj is None:
        # re.search returns None on no match; calling .groups() on it would
        # raise AttributeError instead of reporting the failure.
        resultDict['err'] = "匹配文章时出错"
        return False
    # Strip JavaScript method calls such as .slice(6, -6) so the object
    # literal can be decoded.
    content_temp = re.sub(r'\.\w+?\(.*?\)', '', searchObj.group(1))
    json_text = demjson.decode(content_temp)

    # Skip articles whose aid was already recorded.
    if IsTitleRepeat(aid):
        resultDict['err'] = "txt:文章重复了哦!"
        return False

    # Skip articles whose title already exists in the database.
    news_title = html.unescape(json_text["articleInfo"]["title"].strip('"'))
    if IsTitleRepeatInDb(news_title):
        resultDict['err'] = "db:文章重复了哦!"
        return False

    news_content = html.unescape(json_text["articleInfo"]["content"].strip('"'))
    news_content, focus_img = _localize_images(news_content)
    news_content = news_content.replace('<p>', '<p style="margin: 25px 0;">')
    news_content = news_content.replace('<img', '<img style="display:block;margin: 15px auto;"')

    cursor = conn.cursor()
    try:
        time_stamp = int(time.time())
        # NOTE(review): not used in the visible code; presumably a fake
        # initial click count consumed by the elided INSERT.
        virtrue_click = random.randint(35, 178)
        sql_insert = "INSERT INTO ..."
        cursor.execute(sql_insert)
        lastRowId = cursor.lastrowid
        # Commit, otherwise the database is not updated.
        conn.commit()

        # Attach the cover images: at most the first three downloaded ones.
        temp_list = [
            (lastRowId, '1', img, '', '1', None, '0', None, '100000', time_stamp, None, None, None)
            for img in focus_img[:3]
        ]
        sql_insert = "INSERT INTO ..."
        if temp_list:
            cursor.executemany(sql_insert, temp_list)
            conn.commit()
    finally:
        # Release the cursor even when a statement fails.
        cursor.close()

    # Remember the aid in the text log of posted articles.
    with open('data/newsposted.txt', 'a') as filehand:
        filehand.write('%s\n' % aid)

    resultDict['lastRowId'] = lastRowId
    return True
五、采集评论
def AddCommentToData(aid, article_id):
    """Fetch a random number of Toutiao comments for an article and store them.

    Between 8 and 15 comments are requested (in pages of 10) from the
    ``tab_comments`` endpoint, cleaned of emoji and ``[...]`` emoticon codes,
    and inserted one by one, each attributed to a randomly chosen entry of
    the module-level ``list_majia``.

    Args:
        aid: Toutiao article id, used as both ``group_id`` and ``item_id``.
        article_id: NOTE(review): not referenced by the code visible here;
            presumably the local article row id used by the elided INSERT.

    Returns:
        True once all fetched comments have been processed.

    Raises:
        requests.RequestException: on network failure or timeout.
        IndexError: if ``list_majia`` is empty.
    """
    cursor = conn.cursor()
    try:
        randCount = random.randint(8, 15)
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3741.400 QQBrowser/10.5.3863.400'
        }
        count_max = 10
        # Pages needed to cover randCount comments: ceiling division, at least 1.
        loop_count = max(1, -(-randCount // count_max))

        # Read loop_count pages, merge them, then keep only randCount comments.
        comments = []
        for loop_i in range(loop_count):
            source_url = 'https://www.toutiao.com/article/v2/tab_comments/?app_name=toutiao-web&group_id=%s&item_id=%s&count=%s&offset=%s' % (
                aid, aid, count_max, loop_i * count_max)
            # A timeout keeps a stalled connection from hanging the scraper.
            resp = requests.get(source_url, headers=headers, timeout=10).json()
            page = resp.get('data') or []
            if not page:
                # No (more) comments; further pages would be empty too.
                break
            comments.extend(page)

        comments = comments[:randCount]
        comments.reverse()

        for comment in comments:
            text = comment['comment']['text']
            # Turn emoji into :name: codes, then drop them. Note that this
            # also removes any other text enclosed between two colons.
            text = emoji.demojize(text)
            text = re.sub(r':(.*?):', '', text).strip()
            # Drop short bracketed emoticon codes such as [xxx].
            text = re.sub(r'\[.{,8}\]', '', text)

            time_stamp = int(time.time())
            # Pick a random sock-puppet ("majia") user. random.randint is
            # inclusive on both ends, so randint(0, len(list_majia)) could
            # index one past the end; random.choice cannot.
            majiaid = random.choice(list_majia)[0]

            sql_insert = "INSERT INTO `...."
            cursor.execute(sql_insert)
            # NOTE(review): presumably used by the elided "insert likes" step.
            lastRowId = cursor.lastrowid
            # Commit, otherwise the database is not updated.
            conn.commit()
    finally:
        # Release the cursor even when a request or statement fails.
        cursor.close()
    return True
至此,文章和评论就采集好了。以上只是必要的核心代码,仅供参考!