Read good code the way you read the literature
豆瓣电影 (Douban Movies)
spider.py
import requests
from lxml import etree
# 1. Fetch the target site's page
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36",
'Referer': "https://movie.douban.com/explore"
}
url = 'https://movie.douban.com/cinema/nowplaying/chengdu/'
response = requests.get(url, headers=headers)
# print(response.text)
html = response.text
# 2. Extract the data from the fetched page
html = etree.HTML(html)
# there are two ul lists (now showing and coming soon); take the first
ul = html.xpath("//ul[@class='lists']")[0]
# print(etree.tostring(ul,encoding='utf-8').decode("utf-8"))
# all li elements under the current ul
lis = ul.xpath("./li")
movies = []
for li in lis:
# print(etree.tostring(li,encoding='utf-8').decode('utf-8'))
title = li.xpath("@data-title")[0]
score = li.xpath("@data-score")[0]
duration = li.xpath("@data-duration")[0]
region = li.xpath("@data-region")[0]
director = li.xpath("@data-director")[0]
    actor = li.xpath("@data-actors")[0]
poster = li.xpath(".//img/@src")[0]
movie = {
'title': title,
'score': score,
'duration': duration,
'region': region,
'director': director,
'actor': actor,
'thumbnail': poster
}
movies.append(movie)
print(movies)
README.md
# Scraping Douban movies to practice XPath
## url: https://movie.douban.com/cinema/nowplaying/chengdu/
### Notes:
* response.text returns a decoded string, of type str (unicode)
* response.content returns the raw string exactly as fetched from the page, with no processing applied;
it is of type bytes
* XPath rules:
+ ./ all child nodes of the current node
+ @attr selects an attribute
+ .// all descendant nodes of the current node
+ .//img/@src the src attribute of img tags
* thumbnail: the poster thumbnail image
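A quick sketch of the text/content difference (requests guesses an encoding for .text, while .content leaves decoding to you; the explicit decode below assumes the page is UTF-8, which this one is):

import requests

response = requests.get('https://movie.douban.com/cinema/nowplaying/chengdu/')
print(type(response.content))   # <class 'bytes'>: the raw payload
print(type(response.text))      # <class 'str'>: decoded with a guessed encoding
html = response.content.decode('utf-8')  # explicit decode, equivalent here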
电影天堂 (Movie Paradise)
from lxml import etree
import requests
BASE_DOMAIN = 'http://www.ygdy8.net'
HEADERS = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36",
}
def get_detail_urls(url):
response = requests.get(url, headers=HEADERS)
# print(response.text)
# print(response.content.decode('gbk'))
html = response.text
html = etree.HTML(html)
detail_urls = html.xpath("//table[@class='tbspan']//a/@href")
# for detail_url in detail_urls:
# print(BASE_DOMAIN+detail_url)
detail_urls = map(lambda url:BASE_DOMAIN+url, detail_urls)
return detail_urls
def parse_detail_page(url):
movie = {}
response = requests.get(url,headers=HEADERS)
text = response.content.decode('gbk')
html = etree.HTML(text)
title = html.xpath("//div[@class='title_all']//font[@color='#07519a']/text()")[0]
movie['title'] = title
zoomE = html.xpath("//div[@id='Zoom']")[0]
imgs = zoomE.xpath(".//img/@src")
    # xpath always returns a list; when two images are present, the first is
    # the poster and the second a screenshot
    if len(imgs) >= 2:
        movie['cover'] = imgs[0]
        movie['screenshot'] = imgs[1]
    elif imgs:
        movie['cover'] = imgs[0]
def parse_infor(infor, rule):
return infor.replace(rule,"").strip()
infors = zoomE.xpath(".//text()")
for index,infor in enumerate(infors):
# print(infor)
# print(index)
# print("="*30)
if infor.startswith("◎年 代"):
infor = parse_infor(infor,"◎年 代")
movie['year'] = infor
elif infor.startswith("◎产 地"):
infor = parse_infor(infor, "◎产 地")
movie['country'] = infor
elif infor.startswith("◎类 别"):
infor = parse_infor(infor, "◎类 别")
movie['category'] = infor
elif infor.startswith("◎语 言"):
infor = parse_infor(infor, "◎语 言")
movie['language'] = infor
elif infor.startswith("◎IMDb评分"):
infor = parse_infor(infor, "◎IMDb评分")
movie['imdb_rating'] = infor
elif infor.startswith("◎片 长"):
infor = parse_infor(infor, "◎片 长")
movie['duration'] = infor
elif infor.startswith("◎导 演"):
infor = parse_infor(infor, "◎导 演")
movie['director'] = infor
elif infor.startswith("◎主 演"):
infor = parse_infor(infor, "◎主 演")
actors = [infor]
for x in range(index+1, len(infors)):
actor = infors[x].strip()
if actor.startswith("◎简 介"):
break
actors.append(actor)
movie['actors'] = actors
elif infor.startswith("◎简 介"):
infor = parse_infor(infor, "◎简 介")
# movie['introduction'] = infor
for x in range(index+1,len(infors)):
profile = infors[x].strip()
if profile.startswith("【下载地址】"):
break
movie['profile'] = profile
    download_url = html.xpath("//td[@bgcolor='#fdfddf']/a/@href")[0]
    movie['download_url'] = download_url
return movie
def spider():
    # {} is a placeholder that format() fills in
base_url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'
movies = []
for x in range(1,8):
        # the outer loop walks the 7 list pages
url = base_url.format(x)
detail_urls = get_detail_urls(url)
for detail_url in detail_urls:
            # the inner loop visits every movie detail url on the page
movie = parse_detail_page(detail_url)
movies.append(movie)
print(movie)
# print(movies)
if __name__ == '__main__':
spider()
中国天气网 (China Weather Network)
import requests
from bs4 import BeautifulSoup
from pyecharts import Bar
ALL_DATA = []
def parse_page(url):
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
}
response = requests.get(url, headers=headers)
text = response.content.decode('utf-8')
soup = BeautifulSoup(text, 'html5lib')
conMidtab = soup.find('div', class_='conMidtab')
tables = conMidtab.find_all('table')
for table in tables:
trs = table.find_all('tr')[2:]
for index, tr in enumerate(trs):
tds = tr.find_all('td')
city_td = tds[0]
if index == 0:
city_td = tds[1]
city = list(city_td.stripped_strings)[0]
temp_td = tds[-2]
min_temp = list(temp_td.stripped_strings)[0]
ALL_DATA.append({"city": city, "min_temp": int(min_temp)})
# print({"city":city,"min_temp":min_temp})
def main():
# url = 'http://www.weather.com.cn/textFC/hb.shtml#'
# url = 'http://www.weather.com.cn/textFC/db.shtml'
# url = 'http://www.weather.com.cn/textFC/hd.shtml'
# url = 'http://www.weather.com.cn/textFC/gat.shtml'
urls = [
'http://www.weather.com.cn/textFC/hb.shtml',
'http://www.weather.com.cn/textFC/db.shtml',
'http://www.weather.com.cn/textFC/hd.shtml',
'http://www.weather.com.cn/textFC/hz.shtml',
'http://www.weather.com.cn/textFC/hn.shtml',
'http://www.weather.com.cn/textFC/xb.shtml',
'http://www.weather.com.cn/textFC/xn.shtml',
'http://www.weather.com.cn/textFC/gat.shtml'
]
for url in urls:
parse_page(url)
    # analyze the data:
    # sort by minimum temperature
ALL_DATA.sort(key=lambda data: data['min_temp'])
data = ALL_DATA[0:10]
cities = list(map(lambda x: x['city'], data))
temps = list(map(lambda x: x['min_temp'], data))
chart = Bar("中国天气最低温排行榜")
chart.add('', cities, temps)
chart.render('temperature.html')
if __name__ == '__main__':
main()
# ALL_DATA = [
# {"city": "北京", 'min_temp': '-8'},
# {"city": "天津", 'min_temp': '-9'}
# ]
#
# def sort_key(data):
# min_temp = data['min_temp']
# return min_temp
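Note that from pyecharts import Bar and chart.add(...) belong to the pyecharts 0.x API. On pyecharts 1.x or later the equivalent is roughly the following sketch (untested against a specific version):

from pyecharts.charts import Bar
from pyecharts import options as opts

chart = (
    Bar()
    .add_xaxis(cities)
    .add_yaxis('', temps)
    .set_global_opts(title_opts=opts.TitleOpts(title="中国天气最低温排行榜"))
)
chart.render('temperature.html')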
古诗文网 (Gushiwen)
import re
import requests
def parse_page(url):
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36"
}
response = requests.get(url, headers=headers)
text = response.text
titles = re.findall(r'<div\sclass="cont".*?<b>(.*?)</b>', text, re.S)
dynasties = re.findall(r'<p class="source">.*?<a.*?>(.*?)</a>', text, re.S)
authors = re.findall(r'<p class="source">.*?<a.*?>.*?<a.*?>(.*?)</a>', text, re.S)
content_tags = re.findall(r'<div class="contson".*?>(.*?)</div>', text, re.S)
contents = []
for content in content_tags:
x = re.sub(r'<.*?>', '', content)
contents.append(x.strip())
poems = []
for value in zip(titles, dynasties, authors, contents):
title, dynasty, author, content = value
poem = {
'title': title,
'dynasty': dynasty,
'author': author,
'content': content
}
poems.append(poem)
for poem in poems:
print(poem)
print("=" * 40)
def main():
# url = 'https://www.gushiwen.org/default_1.aspx'
# parse_page(url)
for x in range(1, 11):
url = 'https://www.gushiwen.org/default_%s.aspx' % x
parse_page(url)
if __name__ == '__main__':
main()
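One caveat with this parallel-findall approach: zip() truncates to the shortest input, so if any one regex misses a single entry, all later fields silently shift and pair with the wrong poem. A tiny illustration:

titles = ['a', 'b', 'c']
authors = ['x', 'y']                # one regex missed an entry
print(list(zip(titles, authors)))   # [('a', 'x'), ('b', 'y')] -- 'c' is silently dropped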
Threads, processes, and coroutines in practice
asyncio
import asyncio
import threading
@asyncio.coroutine
def hello():
    print("Hello world! (%s)" % threading.currentThread())
    # asynchronously call asyncio.sleep(1):
    r = yield from asyncio.sleep(1)
    print("Hello again! (%s)" % threading.currentThread())

# get the event loop:
loop = asyncio.get_event_loop()
tasks = [hello(), hello()]
# run both coroutines to completion
loop.run_until_complete(asyncio.wait(tasks))
loop.close()
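@asyncio.coroutine and yield from are deprecated (removed in Python 3.11); a minimal equivalent in modern syntax:

import asyncio
import threading

async def hello():
    print("Hello world! (%s)" % threading.current_thread())
    await asyncio.sleep(1)
    print("Hello again! (%s)" % threading.current_thread())

async def main():
    # run both coroutines concurrently
    await asyncio.gather(hello(), hello())

asyncio.run(main())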
coroutine.py
def consumer():
r = ''
while True:
n = yield r
if not n:
            print('executing return')
return
print('[CONSUMER] Consuming %s...' % n)
r = '200 OK'
def produce(c):
c.send(None)
n = 0
while n < 5:
n = n + 1
print('[PRODUCER] Producing %s...' % n)
r = c.send(n)
        print('[PRODUCER] Consumer return: %s' % r)
c.close()
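To drive the pipeline, create the consumer generator and hand it to produce; this is the usual two-line driver for this classic example:

c = consumer()
produce(c)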
Multithreading (threading)
import time
import threading
# # # the traditional, sequential way
# def coding():
#     for x in range(3):
#         print('writing code %s' % x)
#         time.sleep(1)
#
# def drawing():
#     for x in range(3):
#         print('drawing %s' % x)
#         time.sleep(1)
#
# def main():
#     coding()
#     drawing()
#
# if __name__ == '__main__':
#     main()

# the multithreaded way
def coding():
    for x in range(3):
        print('writing code %s' % threading.current_thread())
        time.sleep(1)

def drawing():
    for x in range(3):
        print('drawing %s' % threading.current_thread())
        time.sleep(1)
def main():
t1 = threading.Thread(target=coding)
t2 = threading.Thread(target=drawing)
t1.start()
t2.start()
print(threading.enumerate())
if __name__ == '__main__':
main()
Multithreading demo 2: Thread subclasses
import threading
import time
class CodingThread(threading.Thread):
def run(self):
for x in range(3):
            print('writing code %s' % threading.current_thread())
time.sleep(1)
class DrawingThread(threading.Thread):
def run(self):
for x in range(3):
            print('drawing %s' % threading.current_thread())
time.sleep(1)
def main():
t1 = CodingThread()
t2 = DrawingThread()
t1.start()
t2.start()
if __name__ == '__main__':
main()
Thread locks (threading.Lock)
import threading
# VALUE = 0
#
# gLock = threading.Lock()
#
# def add_value():
# global VALUE
# for x in range(1000000):
# VALUE += 1
# print('value: %d'%VALUE)
#
#
# def main():
# for x in range(2):
# t = threading.Thread(target=add_value)
# t.start()
#
# if __name__ == '__main__':
# main()
VALUE = 0
gLock = threading.Lock()
def add_value():
global VALUE
gLock.acquire()
for x in range(1000000):
VALUE += 1
gLock.release()
print('value: %d'%VALUE)
def main():
for x in range(2):
t = threading.Thread(target=add_value)
t.start()
if __name__ == '__main__':
main()
Producer-consumer pattern
import threading
import random
import time
gMoney = 1000
gCondition = threading.Condition()
gTotalTimes = 10
gTimes = 0
class Producer(threading.Thread):
def run(self):
global gMoney
global gTimes
while True:
money = random.randint(100,1000)
gCondition.acquire()
if gTimes >= gTotalTimes:
gCondition.release()
break
gMoney += money
            print('%s produced %d yuan; %d yuan now available' % (threading.current_thread(), money, gMoney))
gTimes += 1
gCondition.notify_all()
gCondition.release()
time.sleep(0.5)
class Consumer(threading.Thread):
def run(self):
global gMoney
while True:
money = random.randint(100,1000)
gCondition.acquire()
while gMoney < money:
if gTimes >= gTotalTimes:
gCondition.release()
return
print("%s消费者准备消费%d元钱,剩余%d元钱,不足!"%(threading.current_thread(),money,gMoney))
gCondition.wait()
gMoney -= money
            print('%s spent %d yuan; %d yuan left' % (threading.current_thread(), money, gMoney))
gCondition.release()
time.sleep(0.5)
def main():
for x in range(3):
        t = Consumer(name="consumer-thread-%d" % x)
t.start()
for x in range(4):
        t = Producer(name="producer-thread-%d" % x)
t.start()
if __name__ == '__main__':
main()
Queues (queue.Queue)
import time
from queue import Queue
import threading
# q.put(1)
# q.put(2)
#
# print(q.qsize(),q.empty(),q.full())
# for x in range(4):
# q.put(x)
#
# for x in range(4):
# print(q.get())
def set_value(q):
index = 0
while True:
q.put(index)
index += 1
time.sleep(3)
def get_value(q):
while True:
print(q.get())
def main():
q = Queue(4)
    # args passes the queue into the target function
t1 = threading.Thread(target=set_value, args=[q])
t2 = threading.Thread(target=get_value, args=[q])
t1.start()
t2.start()
if __name__ == '__main__':
main()
斗图 (Doutula memes)
import os
import re
import requests
from lxml import etree
from urllib import request
def parse_page(url):
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36"
}
response = requests.get(url,headers=headers)
text = response.text
html = etree.HTML(text)
imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
for img in imgs:
img_url = img.get('data-original')
alt = img.get('alt')
alt = re.sub(r'[\??\.!!。,]','',alt)
suffix = os.path.splitext(img_url)[1]
filename = alt + suffix
request.urlretrieve(img_url,'images/' + filename)
def main():
    os.makedirs('images', exist_ok=True)  # urlretrieve needs the target directory to exist
    for x in range(1, 101):
url = 'https://www.doutula.com/photo/list/?page=%d'%x
parse_page(url)
# break
if __name__ == '__main__':
main()
斗图, multithreaded producer-consumer version
import os
import re
from queue import Queue
import requests
from lxml import etree
from urllib import request
import threading
class Producer(threading.Thread):
headers = {
'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko)"
" Chrome/69.0.3497.100 Safari/537.36"
}
    # *args / **kwargs forward any parent-class arguments; page_queue and img_queue are our own
def __init__(self,page_queue,img_queue,*args,**kwargs):
        # super() makes sure the parent class is initialized properly
super(Producer, self).__init__(*args,**kwargs)
self.page_queue = page_queue
self.img_queue = img_queue
def run(self):
while True:
if self.page_queue.empty():
break
url = self.page_queue.get()
self.parse_page(url)
def parse_page(self, url):
response = requests.get(url, headers=self.headers)
text = response.text
html = etree.HTML(text)
imgs = html.xpath('//div[@class="page-content text-center"]//img[@class!="gif"]')
for img in imgs:
img_url = img.get('data-original')
alt = img.get('alt')
alt = re.sub(r'[\??\.!!。,]', '', alt)
suffix = os.path.splitext(img_url)[1]
filename = alt + suffix
            # put the (img_url, filename) tuple on the queue
self.img_queue.put((img_url, filename))
class Consumer(threading.Thread):
def __init__(self,page_queue,img_queue,*args,**kwargs):
super(Consumer, self).__init__(*args,**kwargs)
self.page_queue = page_queue
self.img_queue = img_queue
    def run(self):
        # keep downloading until both queues are drained; note this check is racy:
        # a producer may still be parsing while both queues are momentarily empty
        while True:
            if self.img_queue.empty() and self.page_queue.empty():
                break
img_url,filename = self.img_queue.get()
request.urlretrieve(img_url, 'images/' + filename)
print(filename + "已下载完成")
def main():
    os.makedirs('images', exist_ok=True)  # the consumer saves into images/
    page_queue = Queue(100)
img_queue = Queue(1000)
for x in range(1,20):
url = 'https://www.doutula.com/photo/list/?page=%d'%x
page_queue.put(url)
for x in range(5):
t = Producer(page_queue,img_queue)
t.start()
for x in range(5):
t = Consumer(page_queue, img_queue)
t.start()
if __name__ == '__main__':
main()
README.md
# threading
## Comparing the traditional approach with the multithreaded one
## Notes:
1. List all live threads with print(threading.enumerate());
for demo1 it prints [<_MainThread(MainThread, started 8008)>, <Thread(Thread-1, started 18320)>, <Thread(Thread-2, started 6132)>]
2. Get the name of the current thread
3. Wrapping each thread in a class keeps it more self-contained
4. Multithreading pitfall: when several threads modify a global variable (demo3 has two threads incrementing VALUE), runs can interleave and clobber each other, so a lock is required
5. Producer-consumer with Lock (demo4); gTotalTimes=10 and gTimes=0 are the conditions used to break out of the loops
6. threading.Condition can block and wait while no data is available; the wait()/notify_all() mechanism is more efficient and saves memory
7. queue is thread-safe and its operations are atomic
## Application: grabbing memes from 斗图吧
### url: https://www.doutula.com/photo/list/?page=%d
### Notes:
1. Skeleton: write main() with the urls first, then the parse function
2. demo8 is asynchronous and multithreaded, structured as producer-consumer:
page url → producer (extracts meme urls) → meme url → consumer (downloads memes); a shutdown sketch follows below
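A cleaner way to break out of the consumer loop than the racy empty() check is a sentinel value: each producer pushes one end-marker when it finishes, and a consumer exits when it sees it. A minimal sketch (the None sentinel and the single producer/consumer pair are illustrative assumptions, not the demo8 code):

import threading
from queue import Queue

SENTINEL = None  # end-of-work marker

def producer(q):
    for i in range(5):
        q.put(i)      # real work items
    q.put(SENTINEL)   # tell one consumer to stop

def consumer(q):
    while True:
        item = q.get()
        if item is SENTINEL:
            break     # graceful exit: no busy-wait, no race
        print('consumed', item)

q = Queue()
threading.Thread(target=producer, args=(q,)).start()
threading.Thread(target=consumer, args=(q,)).start()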
scrapy
糗事百科 (Qiushibaike)
start.py
from scrapy import cmdline
# cmdline.execute("scrapy crawl qsbk_spider".split())
cmdline.execute(["scrapy",'crawl','qsbk_spider'])
qsbk_spider.py
# -*- coding: utf-8 -*-
import scrapy
# from scrapy.selector.unified import SelectorList
from ..items import QsbkItem
class QsbkSpiderSpider(scrapy.Spider):
name = 'qsbk_spider'
allowed_domains = ['qiushibaike.com']
start_urls = ['https://www.qiushibaike.com/text/page/1/']
base_domain = "https://www.qiushibaike.com"
def parse(self, response):
# selectorlist
duanzidivs = response.xpath('//div[@id="content-left"]/div')
for duanzidiv in duanzidivs:
# selector
author = duanzidiv.xpath('.//h2/text()').get().strip()
content = duanzidiv.xpath('.//div[@class="content"]//text()').getall()
content = ''.join(content).strip()
item = QsbkItem(author=author, content=content)
yield item
        next_url = response.xpath('//ul[@class="pagination"]/li[last()]/a/@href').get()
        # stop when there is no next page; otherwise follow it
        if not next_url:
            return
        yield scrapy.Request(response.urljoin(next_url), callback=self.parse)
items.py
import scrapy
class QsbkItem(scrapy.Item):
collection = table = 'duanzi'
author = scrapy.Field()
content = scrapy.Field()
pipelines.py
from scrapy.exporters import JsonLinesItemExporter
class QsbkPipeline(object):
def __init__(self):
self.fp = open('duanzi.json', 'wb')
self.exporter = JsonLinesItemExporter(self.fp,ensure_ascii=False,encoding='utf-8')
    def open_spider(self, spider):
        print("export started")

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item

    def close_spider(self, spider):
        self.fp.close()
        print("spider finished")
import pymongo
class MongoPipeline(object):
def __init__(self,mongo_url,mongo_db):
self.mongo_url = mongo_url
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls,crawler):
return cls(
mongo_url=crawler.settings.get('MONGO_URL'),
mongo_db=crawler.settings.get('MONGO_DB')
)
def open_spider(self,spider):
self.client = pymongo.MongoClient(self.mongo_url)
self.db = self.client[self.mongo_db]
def process_item(self,item,spider):
        self.db[item.collection].insert_one(dict(item))
return item
def close_spider(self,spider):
self.client.close()
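For from_crawler to find its configuration, settings.py needs the matching keys, and the pipelines must be enabled; a sketch (the module name qsbk and the priority values are assumptions):

# settings.py (sketch)
MONGO_URL = 'mongodb://localhost:27017'
MONGO_DB = 'qsbk'
ITEM_PIPELINES = {
    'qsbk.pipelines.QsbkPipeline': 300,
    'qsbk.pipelines.MongoPipeline': 400,
}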
宝马5系 (BMW 5 Series gallery on Autohome)
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl bmw5".split())
spiders
bmw5.py
# -*- coding: utf-8 -*-
import scrapy
from ..items import BmwItem
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor
class Bmw5Spider(CrawlSpider):
name = 'bmw5'
allowed_domains = ['car.autohome.com.cn']
start_urls = ['https://car.autohome.com.cn/pic/series/65.html']
rules = (
Rule(LinkExtractor(allow=r'https://car.autohome.com.cn/pic/series/65.+'),callback="parse_page", follow=True),
)
def parse_page(self, response):
category = response.xpath('//div[@class="uibox"]//text()').get()
srcs = response.xpath('//div[@class="uibox"]//ul/li/a/img/@src').getall()
srcs = list(map(lambda x:response.urljoin(x.replace('t_','')),srcs))
yield BmwItem(category=category,image_urls=srcs)
def test_parse(self,response):
uiboxs = response.xpath('//div[@class="uibox"]')[1:]
for uibox in uiboxs:
category = uibox.xpath('./div[@class="uibox-title"]/a/text()').get()
urls = uibox.xpath('.//ul/li/a/img/@src').getall()
urls = list(map(lambda url: response.urljoin(url), urls))
item = BmwItem(category=category, image_urls=urls)
yield item
items.py
import scrapy
class BmwItem(scrapy.Item):
category = scrapy.Field()
image_urls = scrapy.Field()
    images = scrapy.Field()
pipelines.py
import os
from urllib import request
from scrapy.pipelines.images import ImagesPipeline
from . import settings
class BmwPipeline(object):
def __init__(self):
self.path = os.path.join(os.path.dirname(os.path.dirname(__file__)),'images')
if not os.path.exists(self.path):
os.mkdir(self.path)
def process_item(self, item, spider):
category = item['category']
        urls = item['image_urls']
category_path = os.path.join(self.path,category)
if not os.path.exists(category_path):
os.mkdir(category_path)
for url in urls:
image_name = url.split('_')[-1]
request.urlretrieve(url,os.path.join(category_path,image_name))
return item
class BMWImagePipeline(ImagesPipeline):
def get_media_requests(self, item, info):
        # the parent implementation is what actually builds the download requests
request_objs = super(BMWImagePipeline,self).get_media_requests(item,info)
for request_obj in request_objs:
request_obj.item = item
return request_objs
def file_path(self, request, response=None, info=None):
path = super(BMWImagePipeline, self).file_path(request,response,info)
category = request.item.get('category')
images_store = settings.IMAGES_STORE
category_path = os.path.join(images_store,category)
if not os.path.exists(category_path):
os.mkdir(category_path)
image_name = path.replace("full/","")
image_path = os.path.join(category_path,image_name)
return image_path
settings.py
import os

# directory for downloaded images, used by the ImagesPipeline
IMAGES_STORE = os.path.join(os.path.dirname(os.path.dirname(__file__)), 'images')
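The custom image pipeline only runs once it is registered; a sketch, assuming the project module is named bmw:

# settings.py (sketch; the module name bmw is an assumption)
ITEM_PIPELINES = {
    'bmw.pipelines.BMWImagePipeline': 1,
}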
简书 (Jianshu)
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl js".split())
spiders
js.py
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from ..items import ArticleItem
from ..middlewares import SeleniumDownloadMiddleware
class JsSpider(CrawlSpider):
name = 'js'
allowed_domains = ['jianshu.com']
start_urls = ['https://www.jianshu.com/']
rules = (
        Rule(LinkExtractor(allow=r'.*/p/[0-9a-z]{12}.*'), callback='parse_detail', follow=True),
)
    def parse_detail(self, response):
        title = response.xpath('//div[@class="article"]/h1/text()').get()
        avatar = response.xpath('//div[@class="article"]//a[@class="avatar"]/img/@src').get()
        author = response.xpath('//div[@class="author"]//span[@class="name"]/a/text()').get()
        pub_time = response.xpath('//span[@class="publish-time"]/text()').get().replace("*", "")
        # the article id is the last path segment of the canonical url
        url = response.url.split("?")[0]
        article_id = url.split('/')[-1]
        content = response.xpath('//div[@class="show-content"]').get()
        word_count = int(response.xpath('//span[@class="wordage"]/text()').get().replace("字数 ", ""))
        comment_count = response.xpath('//span[@class="comments-count"]/text()').get().replace("评论 ", "")
        like_count = response.xpath('//span[@class="likes-count"]/text()').get().replace("喜欢 ", "")
        read_count = response.xpath('//span[@class="views-count"]/text()').get().replace("阅读 ", "")
        subjects = ','.join(response.xpath('//div[@class="include-collection"]/a/div/text()').getall())
        item = ArticleItem(
            title=title,
            avatar=avatar,
            author=author,
            article_id=article_id,
            origin_url=response.url,
            content=content,
            pub_time=pub_time,
            subjects=subjects,
            word_count=word_count,
            comment_count=comment_count,
            read_count=read_count,
            like_count=like_count,
        )
        yield item
items.py
import scrapy
class ArticleItem(scrapy.Item):
collection = scrapy.Field()
title = scrapy.Field()
content = scrapy.Field()
article_id = scrapy.Field()
origin_url = scrapy.Field()
author = scrapy.Field()
avatar = scrapy.Field()
read_count = scrapy.Field()
subjects = scrapy.Field()
like_count = scrapy.Field()
word_count = scrapy.Field()
pub_time = scrapy.Field()
    comment_count = scrapy.Field()
pipelines.py
import pymysql
from twisted.enterprise import adbapi
from pymysql import cursors
class JianshuSpiderPipeline(object):
def __init__(self):
dbparams = {
'host': '127.0.0.1',
'port': 3306,
'user': 'root',
            'password': 'your_password',  # replace with your own credentials
'database': 'jianshu',
'charset': 'utf8'
}
self.conn = pymysql.connect(**dbparams)
self.cursor = self.conn.cursor()
self._sql = None
    def process_item(self, item, spider):
        # the parameter order must match the column order in self.sql
        self.cursor.execute(self.sql, (item['title'], item['content'], item['author'], item['avatar'], item['pub_time'], item['origin_url'], item['article_id'], item['read_count'], item['subjects'], item['like_count'], item['word_count'], item['comment_count']))
self.conn.commit()
return item
@property
def sql(self):
if not self._sql:
self._sql = """
insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id,read_count,subjects,like_count,word_count,comment_count) values(null,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
        return self._sql
class JianshuTwistedPipeline(object):
def __init__(self):
dbparams = {
'host': '127.0.0.1',
'port': 3306,
'user': 'root',
            'password': 'your_password',  # replace with your own credentials
'database': 'jianshu',
'charset': 'utf8',
'cursorclass': cursors.DictCursor
}
self.dbpool = adbapi.ConnectionPool('pymysql',**dbparams)
self._sql = None
@property
def sql(self):
if not self._sql:
self._sql = """
insert into article(id,title,content,author,avatar,pub_time,origin_url,article_id,subjects,read_count,like_count,word_count,comment_count) values(NULL,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
"""
        return self._sql
    def process_item(self, item, spider):
        defer = self.dbpool.runInteraction(self.insert_item, item)
        defer.addErrback(self.handle_error, item, spider)
        return item
def insert_item(self,cursor,item):
cursor.execute(self.sql,(item['title'],item['content'],item['author'],item['avatar'],item['pub_time'],item['origin_url'],item['article_id'],item['subjects'],item['read_count'],item['like_count'],item['word_count'],item['comment_count']))
def handle_error(self,error,item,spider):
print('='*10+'error'+'='*10)
print(error)
print('='*10+'error'+'='*10)
搜房网 (Fang.com)
start.py
from scrapy import cmdline
cmdline.execute("scrapy crawl sfw".split())
sfw.py
# -*- coding: utf-8 -*-
import re
import requests
import scrapy
from ..items import NewhouseItem,ESFItem
class SfwSpider(scrapy.Spider):
name = 'sfw'
allowed_domains = ['fang.com']
start_urls = ['http://www.fang.com/SoufunFamily.htm']
def parse(self, response):
trs = response.xpath('//div[@class="letterSelt"]//div[@id="c02"]//tr')
province = None
for tr in trs:
tds = tr.xpath('./td[not(@class)]')
province_td = tds[0]
province_text = province_td.xpath('.//text()').get()
province_text = re.sub(r'\s','',province_text)
if province_text:
province = province_text
if province == '其它':
continue
city_td = tds[1]
city_links = city_td.xpath('./a')
for city_link in city_links:
city = city_link.xpath('./text()').get()
city_url = city_link.xpath('./@href').get()
if 'bj.' in city_url:
city_newhouse_url = 'http://newhouse.fang.com/house/s/'
city_esf_url = 'http://esf.fang.com/'
else:
city_newhouse_url = re.sub("fang.com","newhouse.fang.com/house/s",city_url)
city_esf_url = re.sub("fang.com","esf.fang.com",city_url)
# if city_newhouse_url in ['http://qianxinan.newhouse.fang.com/house/s/','http://esf.changji.newhouse.fang.com/house/s/',
# 'http://wj.esf.newhouse.fang.com/house/s/',]:
# return
# if city_newhouse_url or city_esf_url in ['http://esf.changji.esf.fang.com/','http://leizhou.esf.fang.com/',
# 'http://yongkang.esf.fang.com/','http://zjg.esf.fang.com/',
# 'http://qianxinan.newhouse.fang.com/house/s/','http://esf.changji.newhouse.fang.com/house/s/',
# 'http://wj.esf.newhouse.fang.com/house/s/','http://yiwu.esf.fang.com/', 'http://kaili.esf.fang.com/',
# 'http://zjg.esf.fang.com/',]:
# return
# print(province,city)
# print(city_esf_url,city_newhouse_url)
yield scrapy.Request(url=city_newhouse_url,callback=self.parse_newhouse,meta={'info':(province,city)})
yield scrapy.Request(url=city_esf_url,callback=self.parse_esf,meta={'info':(province,city),})
# break
# break
def parse_newhouse(self,response):
province,city = response.meta.get('info')
lis = response.xpath('//div[contains(@class,"nl_con")]/ul/li')
for li in lis:
name = li.xpath('.//div[@class="nlcd_name"]/a/text()').get()
if name:
name = name.strip()
else:
continue
house_style = li.xpath('.//div[contains(@class,"house_type")]/a/text()').getall()
# house_style = list(filter(lambda x:x.endswith("居"),house_style))
area = ''.join(li.xpath('.//div[contains(@class,"house_type")]/text()').getall())
area = re.sub('-|/ |\s','',area)
address = li.xpath('.//div[@class="address"]/a/@title').get()
district = li.xpath('.//div[@class="address"]/a/span/text()').get()
district = re.sub('\s|\[|\]','',str(district))
# print(district)
sale = ''.join(li.xpath('.//div[contains(@class,"fangyuan")]/span/text()').getall())
price = ''.join(li.xpath('.//div[@class="nhouse_price"]//text()').getall()).strip()
origin_url = li.xpath('.//div[@class="nlcd_name"]/a/@href').get()
item = NewhouseItem(name=name,house_style=house_style,area=area,address=address,district=district,sale=sale,price=price,origin_url=origin_url,province=province,city=city)
yield item
next_url = response.xpath('.//div[@class="page"]//li/a[@class="next"]/@href').get()
if next_url:
yield scrapy.Request(url=response.urljoin(next_url),callback=self.parse_newhouse,meta={'info':(province,city)})
def parse_esf(self,response):
province,city = response.meta.get('info')
# if "北京" in city:
# response = requests.get('http://esf.fang.com/')
dls = response.selector.xpath('//div[contains(@class,"shop_list")]/dl')
for dl in dls:
name = dl.xpath('.//dd/h4/a/@title').get()
if name:
name = name.strip()
else:
continue
infos = dl.xpath('.//p[@class="tel_shop"]/text()').getall()
            infos = list(map(lambda x: re.sub(r'\s', '', x), infos))
            # not every listing carries all fields; default them so building the
            # item below cannot raise a NameError
            house_style = house_area = house_floor = house_direction = house_year = None
            for info in infos:
if info.endswith("厅"):
house_style = info
elif info.endswith("㎡"):
house_area = info
elif "层" in info:
house_floor = info
elif "向" in info:
house_direction = info
elif "年" in info:
house_year = info.replace('建','')
house_append = dl.xpath('.//p[contains(@class,"label")]/span/text()').get()
if not house_append:
continue
address = ''.join(dl.xpath('.//p[@class="add_shop"]//span/text()').getall())
title = ''.join(dl.xpath('.//p[@class="add_shop"]/a/@title').getall())
price = ''.join(dl.xpath('.//dd[@class="price_right"]/span[position()=1]//text()').getall())
unit = ''.join(dl.xpath('.//dd[@class="price_right"]/span[position()=2]//text()').getall())
origin_url = dl.xpath('.//dd/h4/a/@href').get()
origin_url = response.urljoin(origin_url)
item = ESFItem(name=name,house_style=house_style,house_area=house_area,house_year=house_year,house_append=house_append,house_floor=house_floor,house_direction=house_direction,address=address,price=price,unit=unit,origin_url=origin_url,title=title,province=province,city=city)
yield item
next_url = response.xpath('//div[@class="page_al"]/p[position()=1]/a/@href').get()
if next_url:
yield scrapy.Request(url=response.urljoin(next_url),callback=self.parse_esf,meta={'info':(province,city)})
pipelines.py
from scrapy.exporters import JsonLinesItemExporter
from .items import NewhouseItem,ESFItem
import pymongo
class FangPipeline(object):
def __init__(self):
self.newhouse_fp = open('newhouse.json','wb')
self.esf_fp = open('esf.json','wb')
self.newhouse_exporter = JsonLinesItemExporter(self.newhouse_fp,ensure_ascii=False)
self.esf_exporter = JsonLinesItemExporter(self.esf_fp,ensure_ascii=False)
def process_item(self, item, spider):
if isinstance(item,NewhouseItem):
self.newhouse_exporter.export_item(item)
elif isinstance(item,ESFItem):
self.esf_exporter.export_item(item)
return item
def close_spider(self,spider):
self.newhouse_fp.close()
self.esf_fp.close()
class MongoPipeline(object):
def __init__(self,mongo_url,mongo_db):
self.mongo_url = mongo_url
self.mongo_db = mongo_db
@classmethod
def from_crawler(cls,crawler):
return cls(
mongo_url=crawler.settings.get('MONGO_URL'),mongo_db=crawler.settings.get('MONGO_DATABASE')
)
def open_spider(self,spider):
self.client = pymongo.MongoClient(self.mongo_url)
self.db = self.client[self.mongo_db]
self.db[NewhouseItem.collection].create_index([('id',pymongo.ASCENDING)])
self.db[ESFItem.collection].create_index([('id',pymongo.ASCENDING)])
def close_spider(self,spider):
self.client.close()
def process_item(self,item,spider):
if isinstance(item,NewhouseItem) or isinstance(item,ESFItem):
            self.db[item.collection].insert_one(dict(item))
return item
middlewares.py
import logging
import random

import requests
class UserAgentDownloadMiddleware(object):
USER_AGENTS = [
'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.36',
'Mozilla/5.0 (compatible; U; ABrowse 0.6; Syllable) AppleWebKit/420+ (KHTML, like Gecko)',
'Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; Acoo Browser 1.98.744; .NET CLR 3.5.30729)',
'Mozilla/5.0 (compatible; MSIE 9.0; AOL 9.7; AOLBuild 4343.19; Windows NT 6.1; WOW64; Trident/5.0; FunWebProducts)',
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36'
]
def process_request(self,request,spider):
user_agent = random.choice(self.USER_AGENTS)
request.headers['User-Agent'] = user_agent
class ProxyMiddleware(object):
    def __init__(self, proxy_url):
        self.logger = logging.getLogger(__name__)
        self.proxy_url = proxy_url

    def get_random_proxy(self):
        # fetch one proxy address from the proxy-pool service
        try:
            response = requests.get(self.proxy_url)
            if response.status_code == 200:
                return response.text
        except requests.ConnectionError:
            return False

    def process_request(self, request, spider):
        # only switch to a proxy once Scrapy is retrying the request
        if request.meta.get('retry_times'):
            proxy = self.get_random_proxy()
            if proxy:
                uri = 'https://{proxy}'.format(proxy=proxy)
                self.logger.debug('using proxy ' + proxy)
                request.meta['proxy'] = uri
@classmethod
def from_crawler(cls,crawler):
settings = crawler.settings
return cls(
proxy_url=settings.get('PROXY_URL')
)
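Both middlewares take effect only after being registered in settings.py; a sketch (the module name fang and the proxy-pool endpoint are assumptions):

# settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'fang.middlewares.UserAgentDownloadMiddleware': 543,
    'fang.middlewares.ProxyMiddleware': 544,
}
PROXY_URL = 'http://127.0.0.1:5555/random'  # hypothetical proxy-pool endpoint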