Skip to content

Commit f067176

Browse files
authored
Merge pull request #4597 from StackStorm/statsd_exception_shouldnt_be_fatal
Treat any statsd metric driver related errors as non-fatal (just log the errors)
2 parents 8349d41 + b198d4d commit f067176

File tree

4 files changed

+126
-3
lines changed

4 files changed

+126
-3
lines changed

CHANGELOG.rst

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,8 +59,17 @@ Fixed
5959
* Make sure we don't log auth token and api key inside st2api log file if those values are provided
6060
via query parameter and not header (``?x-auth-token=foo``, ``?st2-api-key=bar``). (bug fix) #4592
6161
#4589
62-
* Fix rendering of config_context in orquesta task that references action in different pack
62+
* Fix rendering of config_context in orquesta task that references action in different pack.
6363
(bug fix) #4570
64+
* Add missing default config location (``/etc/st2/st2.conf``) to the following services:
65+
``st2actionrunner``, ``st2scheduler``, ``st2workflowengine``. (bug fix) #4596
66+
* Update statsd metrics driver so any exception thrown by statsd library is treated as non fatal.
67+
68+
Previously there was an edge case if user used a hostname instead of an IP address for metrics
69+
backend server address. In such scenario, if hostname DNS resolution failed, statsd driver would
70+
throw the exception which would propagate all the way up and break the application. (bug fix) #4597
71+
72+
Reported by Chris McKenzie.
6473

6574
2.10.3 - March 06, 2019
6675
-----------------------

st2common/st2common/metrics/drivers/statsd_driver.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,28 @@
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
1515

16+
import socket
17+
import logging as stdlib_logging
1618
from numbers import Number
1719

1820
import statsd
1921
from oslo_config import cfg
2022

23+
from st2common import log as logging
2124
from st2common.metrics.base import BaseMetricsDriver
2225
from st2common.metrics.utils import check_key
2326
from st2common.metrics.utils import get_full_key_name
27+
from st2common.util.misc import ignore_and_log_exception
28+
29+
30+
LOG = logging.getLogger(__name__)
31+
32+
# Which exceptions thrown by statsd library should be considered as non-fatal
33+
NON_FATAL_EXC_CLASSES = [
34+
socket.error,
35+
IOError,
36+
OSError
37+
]
2438

2539
__all__ = [
2640
'StatsdDriver'
@@ -30,11 +44,22 @@
3044
class StatsdDriver(BaseMetricsDriver):
3145
"""
3246
StatsD Implementation of the metrics driver
47+
48+
NOTE: Statsd uses UDP which is "fire and forget" and any kind of send error is not fatal. There
49+
is an issue with python-statsd library though which doesn't ignore DNS resolution related errors
50+
and bubbles them all the way up.
51+
52+
This of course breaks the application. Any kind of metric related errors should be considered
53+
as non-fatal and not degrade application in any way if an error occurs. That's why we wrap all
54+
the statsd library calls here to ignore the errors and just log them.
3355
"""
56+
3457
def __init__(self):
3558
statsd.Connection.set_defaults(host=cfg.CONF.metrics.host, port=cfg.CONF.metrics.port,
3659
sample_rate=cfg.CONF.metrics.sample_rate)
3760

61+
@ignore_and_log_exception(exc_classes=NON_FATAL_EXC_CLASSES, logger=LOG,
62+
level=stdlib_logging.WARNING)
3863
def time(self, key, time):
3964
"""
4065
Timer metric
@@ -46,6 +71,8 @@ def time(self, key, time):
4671
timer = statsd.Timer('')
4772
timer.send(key, time)
4873

74+
@ignore_and_log_exception(exc_classes=NON_FATAL_EXC_CLASSES, logger=LOG,
75+
level=stdlib_logging.WARNING)
4976
def inc_counter(self, key, amount=1):
5077
"""
5178
Increment counter
@@ -57,6 +84,8 @@ def inc_counter(self, key, amount=1):
5784
counter = statsd.Counter(key)
5885
counter.increment(delta=amount)
5986

87+
@ignore_and_log_exception(exc_classes=NON_FATAL_EXC_CLASSES, logger=LOG,
88+
level=stdlib_logging.WARNING)
6089
def dec_counter(self, key, amount=1):
6190
"""
6291
Decrement metric
@@ -68,6 +97,8 @@ def dec_counter(self, key, amount=1):
6897
counter = statsd.Counter(key)
6998
counter.decrement(delta=amount)
7099

100+
@ignore_and_log_exception(exc_classes=NON_FATAL_EXC_CLASSES, logger=LOG,
101+
level=stdlib_logging.WARNING)
71102
def set_gauge(self, key, value):
72103
"""
73104
Set gauge value.
@@ -79,6 +110,8 @@ def set_gauge(self, key, value):
79110
gauge = statsd.Gauge(key)
80111
gauge.send(None, value)
81112

113+
@ignore_and_log_exception(exc_classes=NON_FATAL_EXC_CLASSES, logger=LOG,
114+
level=stdlib_logging.WARNING)
82115
def inc_gauge(self, key, amount=1):
83116
"""
84117
Increment gauge value.
@@ -90,6 +123,8 @@ def inc_gauge(self, key, amount=1):
90123
gauge = statsd.Gauge(key)
91124
gauge.increment(None, amount)
92125

126+
@ignore_and_log_exception(exc_classes=NON_FATAL_EXC_CLASSES, logger=LOG,
127+
level=stdlib_logging.WARNING)
93128
def dec_gauge(self, key, amount=1):
94129
"""
95130
Decrement gauge value.

st2common/st2common/util/misc.py

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,18 +15,22 @@
1515

1616
from __future__ import absolute_import
1717

18+
import logging
19+
1820
import os
1921
import re
2022
import sys
2123
import collections
24+
import functools
2225

2326
import six
2427

2528
__all__ = [
2629
'prefix_dict_keys',
2730
'compare_path_file_name',
2831
'lowercase_value',
29-
'get_field_name_from_mongoengine_error'
32+
'get_field_name_from_mongoengine_error',
33+
3034
]
3135

3236

@@ -168,3 +172,28 @@ def get_field_name_from_mongoengine_error(exc):
168172
return match.groups()[0]
169173

170174
return msg
175+
176+
177+
def ignore_and_log_exception(exc_classes=(Exception,), logger=None, level=logging.WARNING):
178+
"""
179+
Decorator which catches the provided exception classes and logs them instead of letting them
180+
bubble all the way up.
181+
"""
182+
exc_classes = tuple(exc_classes)
183+
184+
def decorator(func):
185+
@functools.wraps(func)
186+
def wrapper(*args, **kwargs):
187+
try:
188+
return func(*args, **kwargs)
189+
except exc_classes as e:
190+
if len(args) >= 1 and getattr(args[0], '__class__', None):
191+
func_name = '%s.%s' % (args[0].__class__.__name__, func.__name__)
192+
else:
193+
func_name = func.__name__
194+
195+
message = ('Exception in fuction "%s": %s' % (func_name, str(e)))
196+
logger.log(level, message)
197+
198+
return wrapper
199+
return decorator

st2common/tests/unit/test_metrics.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,13 @@
1212
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1313
# See the License for the specific language governing permissions and
1414
# limitations under the License.
15-
from datetime import datetime, timedelta
15+
16+
import socket
17+
from datetime import datetime
18+
from datetime import timedelta
1619

1720
import unittest2
21+
import mock
1822
from mock import patch, MagicMock
1923

2024
from oslo_config import cfg
@@ -200,6 +204,52 @@ def test_get_full_key_name(self):
200204
result = get_full_key_name('api.requests')
201205
self.assertEqual(result, 'st2.prod.api.requests')
202206

207+
@patch('st2common.metrics.drivers.statsd_driver.LOG')
208+
@patch('st2common.metrics.drivers.statsd_driver.statsd')
209+
def test_driver_socket_exceptions_are_not_fatal(self, statsd, mock_log):
210+
# Socket errors such as DNS resolution errors should be considered non fatal and ignored
211+
mock_logger = mock.Mock()
212+
StatsdDriver.logger = mock_logger
213+
214+
# 1. timer
215+
mock_timer = MagicMock(side_effect=socket.error('error 1'))
216+
statsd.Timer('').send.side_effect = mock_timer
217+
params = ('test', 10)
218+
self._driver.time(*params)
219+
statsd.Timer('').send.assert_called_with('st2.test', 10)
220+
221+
# 2. counter
222+
key = 'test'
223+
mock_counter = MagicMock(side_effect=socket.error('error 2'))
224+
statsd.Counter(key).increment.side_effect = mock_counter
225+
self._driver.inc_counter(key)
226+
mock_counter.assert_called_once_with(delta=1)
227+
228+
key = 'test'
229+
mock_counter = MagicMock(side_effect=socket.error('error 3'))
230+
statsd.Counter(key).decrement.side_effect = mock_counter
231+
self._driver.dec_counter(key)
232+
mock_counter.assert_called_once_with(delta=1)
233+
234+
# 3. gauge
235+
params = ('key', 100)
236+
mock_gauge = MagicMock(side_effect=socket.error('error 4'))
237+
statsd.Gauge().send.side_effect = mock_gauge
238+
self._driver.set_gauge(*params)
239+
mock_gauge.assert_called_once_with(None, params[1])
240+
241+
params = ('key1',)
242+
mock_gauge = MagicMock(side_effect=socket.error('error 5'))
243+
statsd.Gauge().increment.side_effect = mock_gauge
244+
self._driver.inc_gauge(*params)
245+
mock_gauge.assert_called_once_with(None, 1)
246+
247+
params = ('key1',)
248+
mock_gauge = MagicMock(side_effect=socket.error('error 6'))
249+
statsd.Gauge().decrement.side_effect = mock_gauge
250+
self._driver.dec_gauge(*params)
251+
mock_gauge.assert_called_once_with(None, 1)
252+
203253

204254
class TestCounterContextManager(unittest2.TestCase):
205255
@patch('st2common.metrics.base.METRICS')

0 commit comments

Comments
 (0)