目的:使用scrapy爬取该知乎用户的粉丝信息,包括每一个粉丝的名称、回答问题数量、文章数量和关注者数量。信息获取成功后,保存到Mysql数据库。
代码:
items.py
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html
import scrapy
class PonymaItem(scrapy.Item):
    """One follower record scraped from a Zhihu followers page."""

    name = scrapy.Field()           # follower's user name
    avatarUrl = scrapy.Field()      # link to the follower's avatar image
    answerCount = scrapy.Field()    # number of answers
    articlesCount = scrapy.Field()  # number of articles
    followerCount = scrapy.Field()  # number of followers
spider py代码
# -*- coding: utf-8 -*-
import scrapy
import re
from ponyma.items import PonymaItem
class ZhihuSpider(scrapy.Spider):
    """Crawl the follower pages of the Zhihu user ``ponyma``.

    The first page (``start_urls``) is handled by :meth:`parse`, which emits
    the followers found on it and then schedules pages 2..``last_page``
    exactly once. Every follow-up page is handled by :meth:`parse_page`.
    """

    name = 'zhihu'
    allowed_domains = ['www.zhihu.com']
    start_urls = ['https://www.zhihu.com/people/ponyma/followers?page=1']

    # URL template and upper bound for the follow-up pages.
    page_url = 'https://www.zhihu.com/people/ponyma/followers?page={}'
    last_page = 100

    # A user record embedded in the page source: from `{"id":` up to the
    # first `}` that follows `"isRealname":`.
    _RECORD_RE = re.compile(r'\{"id":.*?"isRealname":.*?\}')

    # Item field -> pattern whose first group holds that field's raw value.
    _FIELD_PATTERNS = {
        'name': re.compile(r'"name":"(.*?)"'),
        'avatarUrl': re.compile(r'"avatarUrl":"(.*?)"'),
        'answerCount': re.compile(r'"answerCount":(.*?),'),
        'articlesCount': re.compile(r'"articlesCount":(.*?),'),
        'followerCount': re.compile(r'"followerCount":(.*?),'),
    }

    def parse(self, response):
        """Handle the first page and schedule all remaining pages once.

        Yields the follower items of the first page followed by one
        ``scrapy.Request`` per follow-up page.
        """
        yield from self.parse_page(response)

        # Schedule the remaining pages here only, so they are not re-issued
        # from every parsed page.
        for page in range(2, self.last_page + 1):
            url = self.page_url.format(page)
            print('当前url地址:%s' % url)
            yield scrapy.Request(url, callback=self.parse_page)

    def parse_page(self, response):
        """Yield one ``PonymaItem`` per follower record found in *response*."""
        records = self._RECORD_RE.findall(response.text)
        # The first match is skipped, as in the original code; presumably it
        # is the profile owner rather than a follower -- TODO confirm.
        for record in records[1:]:
            item = self._build_item(record)
            if item is not None:
                yield item

    def _build_item(self, record):
        """Build an item from one raw record, or return None if incomplete."""
        item = PonymaItem()
        for field, pattern in self._FIELD_PATTERNS.items():
            match = pattern.search(record)
            if match is None:
                # Skip the incomplete record instead of raising IndexError,
                # which would abort the rest of the page.
                self.logger.warning(
                    'Skipping record without %r: %.200s', field, record)
                return None
            item[field] = match.group(1)
        # Turn backslash escapes in the captured URL into real characters.
        item['avatarUrl'] = (
            item['avatarUrl'].encode().decode('unicode_escape'))
        return item
pipelines.py代码
# -*- coding: utf-8 -*-
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql
# class PonymaPipeline(object):
# def process_item(self, item, spider):
# return item
#保存到数据库
class PonymaPipeline(object):
    """Store every scraped follower item in the MySQL table ``fensi``.

    One connection is opened when the spider starts and closed when it
    finishes, instead of reconnecting for every item.
    """

    insert_sql = (
        "INSERT INTO fensi (name,avatarUrl,answerCount,articlesCount,followerCount) "
        "VALUES (%s,%s,%s,%s,%s)"
    )

    def __init__(self):
        self.db = None
        self.cursor = None

    def open_spider(self, spider):
        """Open the database connection when the spider starts."""
        self._connect()

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        if self.cursor is not None:
            self.cursor.close()
            self.cursor = None
        if self.db is not None:
            self.db.close()
            self.db = None

    def process_item(self, item, spider):
        """Insert *item* as one row and hand it on to the next pipeline.

        A failing insert is rolled back and reported; the item is still
        returned so that later pipelines keep receiving it.
        """
        if self.db is None:
            # Fallback for callers that never invoked open_spider().
            self._connect()

        row = (
            item['name'],
            item['avatarUrl'],
            item['answerCount'],
            item['articlesCount'],
            item['followerCount'],
        )
        try:
            self.cursor.execute(self.insert_sql, row)
            self.db.commit()
            print('成功了')
        except Exception as e:
            # Deliberate best effort: a bad row is skipped, the crawl goes on.
            print('问题数据跳过!.......', e)
            self.db.rollback()
        return item

    def _connect(self):
        """Connect to the ``zhihu`` database and create a cursor."""
        # Replace the placeholder user and password with real credentials.
        # utf8mb4 so that non-ASCII (e.g. Chinese) user names are sent intact.
        self.db = pymysql.connect(
            host='localhost',
            user='XXXXX',
            passwd='XXXXX',
            db='zhihu',
            charset='utf8mb4',
        )
        self.cursor = self.db.cursor()
settings.py代码
# -*- coding: utf-8 -*-
# Scrapy settings for ponyma project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://docs.scrapy.org/en/latest/topics/settings.html
# https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
# https://docs.scrapy.org/en/latest/topics/spider-middleware.html
import random
# Project name and the package that holds (and receives newly generated) spiders.
BOT_NAME = 'ponyma'
NEWSPIDER_MODULE = 'ponyma.spiders'
SPIDER_MODULES = [NEWSPIDER_MODULE]

# robots.txt rules are not obeyed.
ROBOTSTXT_OBEY = False

# Log level for the crawl.
LOG_LEVEL = "WARNING"
# Pool of desktop browser User-Agent strings to pick from.
USER_AGENTS_LIST = [
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
"Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
"Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
"Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"
]
# NOTE: the choice is made once, when this settings module is executed, so the
# same User-Agent string is used for as long as the module is not re-imported.
USER_AGENT = random.choice(USER_AGENTS_LIST)
# Headers attached to requests by default. The User-Agent entry reuses the
# string picked in USER_AGENT.
DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': USER_AGENT,
}
# Enabled item pipelines, mapped to their order value.
ITEM_PIPELINES = {'ponyma.pipelines.PonymaPipeline': 300}
# Enable and configure HTTP caching (disabled by default)
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
运行结果: