网页爬虫设计
项目驱动,需要从网站上爬取文章,并上传至服务器,实现模拟用户发帖。
框架采用Python3,配合爬虫框架Scrapy实现,目前只能抓取静态页,JS+Ajax动态加载的网页见下一篇博客
GitHub地址:https://github.com/JohonseZhang/Scrapy-Spider-based-on-Python3
求Star~
另外,爬取今日头条、淘宝、京东等动态加载的网站时,需要配合selenium和phantomjs框架:
[GitHub地址]:https://github.com/JohonseZhang/python3-scrapy-spider-phantomjs-selenium
求Star~求Star~求Star~
项目结构
代码结构图:
创建项目
- 进入指定文件夹,右击空白处>在此处打开命令行窗口
- 创建项目
scrapy startproject DgSpider
- 1
主要代码文件说明
- 爬虫主类:UrlSpider.py、ContentSpider.py
项目包含2个爬虫主类,分别用于爬取文章列表页所有文章的URL、文章详情页的具体内容
- 内容处理类:pipelines.py
处理内容
- 传输字段类:items.py
暂存爬取的数据
- 设置文件:settings.py
用于主要的参数配置
- 数据库操作:mysqlUtils.py
连接并操作数据库
代码实现
- UrlSpider.py
# -*- coding: utf-8 -*-
import scrapy
from DgSpider.items import DgspiderUrlItem
from scrapy.selector import Selector
from DgSpider import urlSettings
class DgUrlSpider(scrapy.Spider):
    """Collect article URLs from the paginated list pages of one site.

    Every list page yields one ``DgspiderUrlItem`` whose ``url`` field holds
    the absolute, de-duplicated article URLs found on that page.  All
    site-specific values (domain, list-page pattern, XPath) come from
    ``urlSettings``.
    """

    print('Spider DgUrlSpider Staring...')

    # Scrapy needs the spider name as a static class attribute.
    name = 'DgUrlSpider'

    # Restrict the crawl to the configured domain.
    allowed_domains = [urlSettings.DOMAIN]

    # The first list page often does not follow the numbered pattern, so it
    # is configured separately and placed first; the numbered pages
    # 1 .. LIST_URL_RULER_LOOP - 1 follow.  Start requests bypass Scrapy's
    # duplicate filter, so repeated URLs are removed here (order preserved).
    start_urls = list(dict.fromkeys(
        [urlSettings.START_LIST_URL]
        + [
            urlSettings.LIST_URL_RULER_PREFIX + str(page) + urlSettings.LIST_URL_RULER_SUFFIX
            for page in range(1, urlSettings.LIST_URL_RULER_LOOP)
        ]
    ))

    def parse(self, response):
        """Extract the article links of one list page.

        Args:
            response: the downloaded list page.

        Yields:
            A single ``DgspiderUrlItem`` whose ``url`` field is a list of
            absolute article URLs in page order, without duplicates.
        """
        hrefs = response.xpath(urlSettings.POST_URL_XPATH).extract()

        # urljoin resolves absolute, protocol-relative ("//host/a.html") and
        # relative ("/a.html") links against the page URL.  Stripping and
        # re-adding a literal "http:" prefix broke https and relative links.
        absolute_urls = [
            response.urljoin(href.strip()) for href in hrefs if href.strip()
        ]

        item_url = DgspiderUrlItem()
        item_url['url'] = list(dict.fromkeys(absolute_urls))
        yield item_url
- ContentSpider.py
# -*- coding: utf-8 -*-
import scrapy
from DgSpider.mysqlUtils import dbhandle_geturl
from DgSpider.items import DgspiderPostItem
from scrapy.selector import Selector
from scrapy.http import Request
from DgSpider import contentSettings
from DgSpider import urlSettings
from DgSpider.mysqlUtils import dbhandle_update_status
class DgContentSpider(scrapy.Spider):
    """Crawl one pending article (and its continuation pages) per run.

    On construction the spider asks the database for one URL that has not
    been crawled yet, marks it as handled and uses it as the only start URL.
    Continuation pages ``<url>_2.html`` .. ``<url>_6.html`` are requested one
    after another, each only after the previous page was parsed.

    The database work happens in ``__init__`` rather than in the class body:
    a class body runs whenever the module is imported, which Scrapy's spider
    loader does even when a different spider is being started, so a URL
    would be consumed (or the process terminated by ``dbhandle_geturl``)
    without this spider ever running.  As a consequence ``url``,
    ``spider_name``, ``site``, ``gid``, ``module`` and ``start_urls_tmp``
    are instance attributes.
    """

    print('Spider DgContentSpider Staring...')

    # Scrapy needs the spider name as a static class attribute.
    name = 'DgContentSpider'

    # Suffix that paginated article URLs are expected to end with.
    PAGE_SUFFIX = '.html'
    # Highest continuation page that is probed (url_2.html .. url_6.html).
    MAX_PAGES = 6

    def __init__(self, *args, **kwargs):
        """Fetch the article to crawl and prepare the pagination queue."""
        super().__init__(*args, **kwargs)

        result = dbhandle_geturl(urlSettings.GROUP_ID)
        self.url = result[0]
        self.spider_name = result[1]
        self.site = result[2]
        self.gid = result[3]
        self.module = result[4]

        # Limit the crawl to the site the URL was collected from.
        self.allowed_domains = [self.site]
        self.start_urls = [self.url]

        # Continuation pages follow the pattern url.html, url_2.html,
        # url_3.html, ...  The list is stored in descending order so that
        # pop() hands out page 2 first.  URLs without the expected suffix
        # get no continuation pages instead of a blindly truncated stem.
        if self.url.endswith(self.PAGE_SUFFIX):
            stem = self.url[:-len(self.PAGE_SUFFIX)]
            self.start_urls_tmp = [
                stem + "_" + str(page) + self.PAGE_SUFFIX
                for page in range(self.MAX_PAGES, 1, -1)
            ]
        else:
            self.start_urls_tmp = []

        # Mark the URL as handled whether or not the crawl succeeds, so a
        # failing page is not picked up again on every run.
        dbhandle_update_status(self.url, 1)

    def parse(self, response):
        """Extract title and body of one article page.

        Args:
            response: the downloaded article page.

        Yields:
            A ``DgspiderPostItem`` for the page, followed by a ``Request``
            for the next continuation page while any are left to try.
        """
        item = DgspiderPostItem()
        item['url'] = self.url

        # string(.) flattens nested markup such as
        # <div><h1><span>title</span></h1></div> into plain text.
        title_nodes = response.xpath(contentSettings.POST_TITLE_XPATH)
        item['title'] = title_nodes.xpath('string(.)').extract()
        item['text'] = response.xpath(contentSettings.POST_CONTENT_XPATH).extract()
        yield item

        if self.start_urls_tmp:
            yield Request(self.start_urls_tmp.pop(), callback=self.parse)
- pipelines.py
# -*- coding: utf-8 -*-
# Define your item pipelines here
# If you have many piplelines, all should be init here
# and use IF to judge them
#
# DOUGUO Spider pipelines
# @author zhangjianfei
# @date 2017/04/13
import re
import urllib.request
from DgSpider import urlSettings
from DgSpider import contentSettings
from DgSpider.mysqlUtils import dbhandle_insert_content
from DgSpider.uploadUtils import uploadImage
from DgSpider.mysqlUtils import dbhandle_online
from DgSpider.mysqlUtils import dbhandle_update_status
from bs4 import BeautifulSoup
from DgSpider.PostHandle import post_handel
from DgSpider.commonUtils import get_random_user
from DgSpider.commonUtils import get_linkmd5id
class DgPipeline(object):
    """Item pipeline shared by the URL spider and the content spider.

    * Items of the URL spider: every URL not yet known is inserted into
      ``dg_spider.dg_spider_post`` with status ``'0'``.
    * Items of the content spider: title and text of all pages of the
      article are accumulated, images are downloaded, uploaded and replaced
      by ``[dgimg]url;w;h[/dgimg]`` markers.  The result is written to the
      database when the spider closes.

    The accumulated article state lives on the class, as in the original
    design where one process crawls one article.
    """

    # Reply list for building the post; not used by the code in this class.
    cs = []
    # Title of the post (taken from the first page only).
    title = ''
    # Text of the post, accumulated over all pages.
    text = ''
    # URL of the article being crawled.
    url = ''
    # Randomly chosen user id the post is created with.
    user_id = ''
    # 1 once an image with a usable source has been found in the article.
    has_img = 0
    # Number of items seen by this pipeline instance so far.
    get_title_flag = 0

    # A single <img ...> tag.  The pattern must not be greedy, otherwise
    # everything up to the last '>' of the line is treated as the tag.
    _IMG_TAG_RE = re.compile(r'<img\b[^>]*>', re.I)
    # The src attribute of an <img> tag (not data-src or similar).
    _IMG_SRC_RE = re.compile(r'\ssrc\s*=\s*(["\'])(.*?)\1', re.I | re.S)

    def __init__(self):
        DgPipeline.user_id = get_random_user(contentSettings.CREATE_POST_USER)

    def process_item(self, item, spider):
        """Dispatch the item to the handler of the spider that produced it.

        Args:
            item: ``DgspiderPostItem`` or ``DgspiderUrlItem``.
            spider: the running spider; its ``name`` selects the handler.

        Returns:
            The unchanged item.
        """
        self.get_title_flag += 1
        if spider.name == contentSettings.SPIDER_NAME:
            self._collect_content(item)
        elif spider.name == urlSettings.SPIDER_NAME:
            self._store_urls(item, spider)
        return item

    def open_spider(self, spider):
        """Called when the spider is opened; nothing to prepare."""
        pass

    def close_spider(self, spider):
        """Persist the collected article when the content spider closes.

        The post is only published (``post_handel``) when the article has
        an image and both title and content are non-empty.
        """
        if spider.name != contentSettings.SPIDER_NAME:
            return

        url = DgPipeline.url
        title = DgPipeline.title
        content = DgPipeline.text
        # The status flag was already set when the spider picked the URL.
        dbhandle_insert_content(url, title, content, DgPipeline.user_id, DgPipeline.has_img)

        if DgPipeline.has_img != 1:
            spider.logger.info('has_img=0, changing status and ready to exit...')
        elif title.strip() != '' and content.strip() != '':
            spider.logger.info('has_img=1,title and content is not null! Uploading post into db...')
            post_handel(url)
        else:
            spider.logger.info('has_img=1,but title or content is null! ready to exit...')

    @staticmethod
    def _find_img_tags(html):
        """Return the distinct <img> tags of *html* in document order."""
        return list(dict.fromkeys(DgPipeline._IMG_TAG_RE.findall(html)))

    @staticmethod
    def _img_src(img_tag):
        """Return the src value of *img_tag*, or None if it has none."""
        match = DgPipeline._IMG_SRC_RE.search(img_tag)
        return match.group(2) if match else None

    @staticmethod
    def _absolute_url(page_url, src):
        """Resolve an image source against the URL of the page it is on."""
        from urllib.parse import urljoin
        return urljoin(page_url, src)

    def _upload_image(self, img_url):
        """Download one image, upload it and return its 'url;w;h' marker."""
        file_name = img_url.split('/')[-1]
        file_path = contentSettings.IMAGES_STORE + file_name
        urllib.request.urlretrieve(img_url, file_path)
        uploaded = uploadImage(file_path, 'image/jpeg', DgPipeline.user_id)['result']
        return ';'.join(str(uploaded[key]) for key in ('image_url', 'w', 'h'))

    def _collect_content(self, item):
        """Append title, text and images of one article page to the state."""
        DgPipeline.url = item['url']

        # For paginated articles only the first page provides the title.
        if self.get_title_flag == 1:
            raw_title = item['title'][0] if item['title'] else ''
            # stripped_strings avoids the extra line breaks prettify() adds.
            title = ''.join(BeautifulSoup(raw_title, "lxml").stripped_strings)
            # Quotes are replaced by typographic ones because the text ends
            # up in SQL statements elsewhere.
            DgPipeline.title = title.replace("'", "”").replace('"', '“')

        html = item['text'][0] if item['text'] else ''
        for img_tag in self._find_img_tags(html):
            src = self._img_src(img_tag)
            if not src:
                # Tags without a source are dropped with the other markup.
                continue
            DgPipeline.has_img = 1
            marker = self._upload_image(self._absolute_url(item['url'], src))
            html = html.replace(img_tag, '[dgimg]' + marker + '[/dgimg]')

        body = ''.join(
            line + '\n' for line in BeautifulSoup(html, "lxml").stripped_strings
        )
        DgPipeline.text = self.text + body.replace('"', '“')

    def _store_urls(self, item, spider):
        """Insert every not yet known URL of *item* into the URL table."""
        select_sql = 'select md5_url from dg_spider.dg_spider_post where md5_url=%s'
        insert_sql = (
            'insert into dg_spider.dg_spider_post'
            '(md5_url, url, spider_name, site, gid, module, status) '
            'values(%s, %s, %s, %s, %s, %s, %s)'
        )
        db_object = dbhandle_online()
        try:
            cursor = db_object.cursor()
            for url in item['url']:
                linkmd5id = get_linkmd5id(url)
                row = (
                    linkmd5id,
                    url,
                    contentSettings.SPIDER_NAME,
                    urlSettings.DOMAIN,
                    urlSettings.GROUP_ID,
                    urlSettings.MODULE,
                    '0',
                )
                try:
                    cursor.execute(select_sql, (linkmd5id,))
                    known = cursor.fetchone()
                    if known is None or not known[0].strip():
                        cursor.execute(insert_sql, row)
                    db_object.commit()
                except Exception:
                    # Best effort: one failing URL must not stop the others.
                    spider.logger.exception('could not store url %s', url)
                    db_object.rollback()
        finally:
            db_object.close()
- items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
# douguo Spider Item
# @author zhangjianfei
# @date 2017/04/07
import scrapy
class DgspiderUrlItem(scrapy.Item):
    """Item produced by the URL spider for one article list page."""

    # List of article URLs found on the list page.
    url = scrapy.Field()
class DgspiderPostItem(scrapy.Item):
    """Item produced by the content spider for one page of an article."""

    # URL of the article the page belongs to.
    url = scrapy.Field()
    # Extracted title strings; the pipeline uses the first entry.
    title = scrapy.Field()
    # Extracted HTML fragments of the body; the pipeline uses the first entry.
    text = scrapy.Field()
- settings.py
这个文件只需要更改或加上特定的配置项
# Name of the Scrapy project.
BOT_NAME = 'DgSpider'

# Package that holds the spiders, and where newly generated spiders are put.
SPIDER_MODULES = ['DgSpider.spiders']
NEWSPIDER_MODULE = 'DgSpider.spiders'

# Register the item pipeline; the number is its order among pipelines.
ITEM_PIPELINES = {'DgSpider.pipelines.DgPipeline': 1}
- mysqlUtils.py
import pymysql
import pymysql.cursors
import os
def dbhandle_online(host='192.168.1.235', user='root', passwd='douguo2015', charset='utf8'):
    """Open a new PyMySQL connection with ``use_unicode=False``.

    The connection parameters default to the values that used to be
    hard-coded, so existing calls without arguments behave as before.

    Args:
        host: database server address.
        user: database user.
        passwd: password of *user*.
        charset: connection character set.

    Returns:
        An open ``pymysql`` connection.  The caller is responsible for
        closing it.
    """
    return pymysql.connect(
        host=host,
        user=user,
        passwd=passwd,
        charset=charset,
        use_unicode=False,
    )
def dbhandle_local(host='192.168.1.235', user='root', passwd='douguo2015', charset='utf8'):
    """Open a new PyMySQL connection with ``use_unicode=True``.

    The connection parameters default to the values that used to be
    hard-coded, so existing calls without arguments behave as before.

    Args:
        host: database server address.
        user: database user.
        passwd: password of *user*.
        charset: connection character set.

    Returns:
        An open ``pymysql`` connection.  The caller is responsible for
        closing it.
    """
    return pymysql.connect(
        host=host,
        user=user,
        passwd=passwd,
        charset=charset,
        use_unicode=True,
    )
def dbhandle_geturl(gid):
    """Fetch one not yet crawled URL (status 0) of the given group.

    Args:
        gid: group id the URL must belong to.

    Returns:
        A tuple ``(url, spider_name, site, gid, module)`` of decoded
        strings.

    Raises:
        Exception: whatever the database driver raised; the error is
            printed and the transaction rolled back first.

    Note:
        When no pending URL exists the whole process is terminated with
        ``os._exit(0)``; the function does not return in that case.
    """
    sql = (
        'select url,spider_name,site,gid,module from dg_spider.dg_spider_post '
        'where status=0 and gid=%s limit 1'
    )
    conn = dbhandle_online()
    try:
        with conn.cursor() as cursor:
            # The driver escapes the parameter; no string formatting of SQL.
            cursor.execute(sql, (gid,))
            result = cursor.fetchone()
        conn.commit()
    except Exception as e:
        print("***** exception")
        print(e)
        conn.rollback()
        # Re-raise the real error instead of failing later on an unbound
        # result variable.
        raise
    finally:
        conn.close()

    if result is None:
        os._exit(0)
    # The connection returns bytes (use_unicode=False), hence the decode.
    return tuple(column.decode() for column in result)
def dbhandle_insert_content(url, title, content, user_id, has_img):
    """Store the crawled article for *url*, or discard it when it is empty.

    Args:
        url: URL identifying the row in ``dg_spider.dg_spider_post``.
        title: article title.
        content: article text.
        user_id: id of the user the post will be created with.
        has_img: image flag stored with the article.

    Returns:
        None.

    Raises:
        Exception: whatever the database driver raised while storing a
            non-empty article; the error is printed and the transaction
            rolled back first.

    Note:
        When title or content is blank the row is only marked with status
        ``'1'`` and the whole process is terminated with ``os._exit(0)``,
        so the next run continues with a new URL.
    """
    discard = content.strip() == '' or title.strip() == ''
    if discard:
        sql = 'update dg_spider.dg_spider_post set status=%s where url=%s'
        args = ('1', url)
    else:
        sql = (
            'update dg_spider.dg_spider_post '
            'set title=%s,content=%s,user_id=%s,has_img=%s where url=%s'
        )
        args = (title, content, str(user_id), str(has_img), url)

    conn = dbhandle_online()
    try:
        with conn.cursor() as cur:
            # Parameters are escaped by the driver, so quotes or backslashes
            # in the article text cannot break or alter the statement.
            cur.execute(sql, args)
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
        if not discard:
            raise
    finally:
        conn.close()

    if discard:
        os._exit(0)
    return None
def dbhandle_update_status(url, status):
    """Set the status column of the row identified by *url*.

    Args:
        url: URL identifying the row in ``dg_spider.dg_spider_post``.
        status: new status value; it is stored in its string form.

    Returns:
        None.

    Raises:
        Exception: whatever the database driver raised; the error is
            printed and the transaction rolled back first.
    """
    sql = 'update dg_spider.dg_spider_post set status=%s where url=%s'
    conn = dbhandle_online()
    try:
        with conn.cursor() as cur:
            # The driver escapes the parameters; no string formatting of SQL.
            cur.execute(sql, (str(status), url))
        conn.commit()
    except Exception as e:
        print(e)
        conn.rollback()
        raise
    finally:
        conn.close()
    return None
def dbhandle_get_content(url):
    """Read the stored article of an already crawled URL (status 1).

    Args:
        url: URL identifying the row in ``dg_spider.dg_spider_post``.

    Returns:
        A tuple ``(title, content, user_id, gid)`` of decoded strings.

    Raises:
        Exception: whatever the database driver raised; the error is
            printed and the transaction rolled back first.

    Note:
        When no matching row exists the whole process is terminated with
        ``os._exit(1)``; the function does not return in that case.
    """
    sql = (
        'select title,content,user_id,gid from dg_spider.dg_spider_post '
        'where status=1 and url=%s limit 1'
    )
    conn = dbhandle_online()
    try:
        with conn.cursor() as cursor:
            # The driver escapes the parameter; no string formatting of SQL.
            cursor.execute(sql, (url,))
            result = cursor.fetchone()
        conn.commit()
    except Exception as e:
        print("***** exception")
        print(e)
        conn.rollback()
        raise
    finally:
        conn.close()

    if result is None:
        os._exit(1)
    # The connection returns bytes (use_unicode=False), hence the decode.
    return tuple(column.decode() for column in result)
# Fetch the initialisation parameters of the spider.
def dbhandle_get_spider_param(url):
    """Read title, content, user id and group id of a pending URL (status 0).

    Args:
        url: URL identifying the row in ``dg_spider.dg_spider_post``.

    Returns:
        A tuple ``(title, content, user_id, gid)`` of decoded strings.

    Raises:
        LookupError: no row with status 0 exists for *url*.
        Exception: whatever the database driver raised; the error is
            printed and the transaction rolled back first.
    """
    sql = (
        'select title,content,user_id,gid from dg_spider.dg_spider_post '
        'where status=0 and url=%s limit 1'
    )
    conn = dbhandle_online()
    try:
        with conn.cursor() as cursor:
            # The driver escapes the parameter; no string formatting of SQL.
            cursor.execute(sql, (url,))
            result = cursor.fetchone()
        conn.commit()
    except Exception as e:
        print("***** exception")
        print(e)
        conn.rollback()
        raise
    finally:
        conn.close()

    if result is None:
        # Indexing the missing row used to fail with an obscure TypeError.
        raise LookupError('no pending row found for url %r' % (url,))
    # NOTE(review): columns that are NULL in the database cannot be decoded;
    # confirm that title/content are always filled for status 0 rows.
    return tuple(column.decode() for column in result)
一些特别的常量及参数,也是用py文件加入
urlSettings.py:
# Domain the URL spider is allowed to crawl.
DOMAIN = 'eastlady.cn'

# Name of the URL spider; must not be changed.
SPIDER_NAME = 'DgUrlSpider'

# Group id and module stored with every collected URL.
GROUP_ID = '33'
MODULE = '999'

# First article list page.
START_LIST_URL = 'http://www.eastlady.cn/emotion/pxgx/1.html'

# Numbered list pages are built as PREFIX + page number + SUFFIX for the
# page numbers 1 .. LIST_URL_RULER_LOOP - 1.
LIST_URL_RULER_PREFIX = 'http://www.eastlady.cn/emotion/pxgx/'
LIST_URL_RULER_SUFFIX = '.html'
LIST_URL_RULER_LOOP = 30

# XPath that selects the article links on a list page.
POST_URL_XPATH = '//div[@class="article_list"]/ul/li/span[1]/a[last()]/@href'
contentSettings.py:
# -*- coding: utf-8 -*-
# Scrapy settings for DgSpider project
# Local directory downloaded images are saved to; the file name is appended
# directly, so the trailing separator is required.
IMAGES_STORE = 'D:\\pics\\jfss\\'

# Domain to crawl.
DOMAIN = 'nrsfh.com'

# Scheme prefix for image URLs.
DOMAIN_HTTP = "http:"

# Ids of the users a post may be created with; one of them is picked at
# random.
CREATE_POST_USER = '37619,18441390'

# Name of the content spider.
SPIDER_NAME = 'DgContentSpider'

# XPath expressions that select title and body of an article page.
POST_TITLE_XPATH = '//div[@class="title"]'
POST_CONTENT_XPATH = '//div[@class="bodycss"]'
启动爬虫
进入爬虫代码所在的文件夹,右击:在此打开命令行窗口,先执行:
scrapy crawl DgUrlSpider
进行爬取所有的URL,并入库
再执行:
scrapy crawl DgContentSpider
从数据库中读取URL,抓取网页内容,入库
当然,也可以写一个Windows批处理脚本,持续不断地执行scrapy crawl DgContentSpider:
@echo DOUGUO window Spider
@rem /d also switches the drive, so the script works when it is started
@rem from a drive other than D:.
cd /d D:\Scrapy\DgSpider
@rem Run the content spider 7000 times in a row.
for /l %%i in (1,1,7000) do scrapy crawl DgContentSpider
:end
@echo SUCCESS! PRESS ANY KEY TO EXIT!
@Pause>nul
当然,这种方式比较笨拙,最好还是启用cmdline,加入多线程,这里不说明
处理完上面的所有步骤,就能成功地抓取到网页数据: