From b075bc638abf6f222d2077511cb718f8794f7120 Mon Sep 17 00:00:00 2001
From: Bob
Date: Sat, 21 Oct 2017 13:49:35 +0000
Subject: [PATCH] Fixed ISO timestamp parsing to be consistent. Documented model.

---
 README.md            | 33 ++++++++++++++++++++++++
 busybody.py          |  2 +-
 flatfile/flatfile.py | 60 +++++++++++++++++++++++++++-----------------
 gsuite/gsuite.py     |  6 ++---
 slack/slack.py       |  6 ++---
 5 files changed, 77 insertions(+), 30 deletions(-)

diff --git a/README.md b/README.md
index 0424ccd..b5a9ac0 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,19 @@ As noted above, each of those should be a standard YAML dictionary, an example o
 
 There are a few lower-level configuration options of special note that will not be covered in the per-module READMEs as they apply either to the system as a whole, or to multiple types of modules. These are:
 
+> enabled
+
+This is a dummy dictionary entry that may be provided within any module that needs no further configuration, but that must still be listed in a particular section. Any module that you wish to poll from **must** be listed inside the "pollers" top-level dictionary. Similarly, any module that you wish to perform analysis on **must** be listed inside the "analysis" top-level dictionary.
+
+
+> geoip
+
+This dictionary inside the "analysis" top-level dictionary should contain two entries that point to databases provided by MaxMind. The "city\_db" entry should be a MaxMind city-resolution database. The "asn\_db" entry should be a MaxMind IP-\>ASN database. Both are freely available from [MaxMind's site](http://dev.maxmind.com/geoip/geoip2/geolite2/).
+
+> history\_limit
+
+This is a numeric entry within the "analysis" top-level dictionary. It sets how far back (in seconds) each analysis run should look. This can be very useful for letting trends age out, and it prevents unbounded growth of the analysis workload as you accumulate events.
+
 > user\_domain
 
 This is a string entry within any module dictionary inside of the "analysis" top-level dictionary. This string provides a domain to append to the user names from the module to convert them into email-style strings. NOTE: This is a blunt tool that will be insufficient for many cases. It is applied prior to the below "user\_map", however, so it may be useful for an initial pass with later corrections. Generally, it is preferable to already have emails in the logs as they serve as a consistent cross-service user identifier.
@@ -76,6 +89,26 @@ As a note, the output of this program will be the result of statistical tests ru
 
 Interpretation of the output may require an analyst to review other entries from the user that has been flagged in order to determine the cause of the flag. Please bear that in mind and only take action against a flagged user account if further investigation shows that such action is merited. Just like an actual neighborhood watch, just taking reports at face value may lead to undesirable outcomes.
 
+
+## Documentation of the model
+
+This machine learning model uses an isolation forest as the final decision function because of its largely parameter-free nature and its well-recognized performance on higher-dimensional data (O(n) time and O(1) space), which is the kind of data we get when vectorizing text as we do here. Each user is assigned their own model so that users with loosely clustered activity do not mask anomalous behavior from users whose activity is more tightly clustered.
+
+There are four components at this time going into the final decision function. These are:
+
+### Device Identifier
+
+This is primarily the user-agent, which is run through a term frequency-inverse document frequency (tf-idf) vectorizer. Ideally we would have both the user-agent and some other form of "device" identifier (such as a random UUID) so that we could assess whether a new connection was really from a new device or from a device we already know. Unfortunately, none of the existing modules expose this information (and, in fact, only Slack exposes the user-agent).
+
+### IP Geolocation
+
+One piece of data that we attempt to gather from the IP is the geolocation, using databases from MaxMind. This information may not always be accurate, but in many cases it provides a helpful clue about whether a connection fits the user's normal patterns.
+
+### IP ASN Organization
+
+The other piece of the puzzle that comes from the IP is the owning ASN's organization name. This allows the model to correlate different networks (which may geolocate to different places) that are owned by the same group. For example, many mobile network providers assign IPs from numerous ranges, and these ranges are often allocated at the national level, so even a user who stays within a limited geographical area may not get an IP that geolocates to that area. This field is also run through tf-idf to turn the text into useful numbers, which lets us account for minor differences in organization name across ASNs belonging to larger organizations.
+
+
 ## Changelog
 
 * 1.0 - Initial public release
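As a companion to the model documentation added above, the following sketch shows how the described components could be wired together. It is illustrative only and not part of this patch: it assumes scikit-learn, and the helper name `build_user_model` and the event keys `ua`, `asn_org`, `lat`, and `lon` are hypothetical stand-ins for busybody's real preprocessing output.

```python
# Illustrative sketch only -- not part of this patch. Assumes scikit-learn.
# The event keys ("ua", "asn_org", "lat", "lon") are hypothetical stand-ins.
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.feature_extraction.text import TfidfVectorizer

def build_user_model(events):
    """Fit a model for ONE user, mirroring the per-user design above."""
    # tf-idf turns the free-text fields (user-agent, ASN organization)
    # into sparse numeric vectors the forest can consume.
    ua_vec = TfidfVectorizer().fit([e["ua"] for e in events])
    org_vec = TfidfVectorizer().fit([e["asn_org"] for e in events])

    def featurize(evts):
        ua = ua_vec.transform([e["ua"] for e in evts]).toarray()
        org = org_vec.transform([e["asn_org"] for e in evts]).toarray()
        geo = np.array([[e["lat"], e["lon"]] for e in evts])
        return np.hstack([ua, org, geo])

    # The isolation forest needs little tuning and scales linearly,
    # which is why the README favors it for this feature space.
    forest = IsolationForest(n_estimators=100).fit(featurize(events))
    return forest, featurize

# forest.predict(featurize(new_events)) yields -1 for outliers, 1 for inliers.
```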
diff --git a/busybody.py b/busybody.py
index c148848..b868454 100755
--- a/busybody.py
+++ b/busybody.py
@@ -71,7 +71,7 @@ def preprocess(config, data):
                 not event[user_field] or ip_field not in event or not event[ip_field] or \
                 ua_field not in event or not event[ua_field]:
             continue
-        if type(event[ts_field]) == str:
+        if type(event[ts_field]) == str and "T" in event[ts_field]:
             ts = datetime.timestamp(datetime.strptime(event[ts_field], '%Y-%m-%dT%H:%M:%S.%fZ'))
         else:
             ts = event[ts_field]
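The condition added above encodes the normalization rule for the two timestamp shapes the pollers produce. Here is the same rule as a standalone, testable helper; this is an illustrative sketch only (the `normalize_ts` name and the example values are hypothetical), not code from the repository.

```python
# Illustrative sketch of the timestamp rule applied in the patch above.
from datetime import datetime

ISO_FORMAT = '%Y-%m-%dT%H:%M:%S.%fZ'

def normalize_ts(value):
    """Return epoch seconds for either timestamp shape.

    G Suite-style events carry ISO-8601 strings (e.g.
    "2017-10-21T13:49:35.123Z"); Slack-style events carry numeric epoch
    seconds. The "T" check distinguishes an ISO string from a numeric
    value that merely arrived as a string.
    """
    if type(value) == str and "T" in value:
        # Note: strptime() yields a naive datetime, so timestamp()
        # interprets it in local time, matching the patch's behavior.
        return datetime.timestamp(datetime.strptime(value, ISO_FORMAT))
    return value

assert normalize_ts(1508593775) == 1508593775
assert isinstance(normalize_ts("2017-10-21T13:49:35.123Z"), float)
```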
diff --git a/flatfile/flatfile.py b/flatfile/flatfile.py
index dc5bd94..226922b 100644
--- a/flatfile/flatfile.py
+++ b/flatfile/flatfile.py
@@ -2,6 +2,7 @@ import sys
 import json
 import logging
 from pathlib import Path
+from datetime import datetime

 logger = logging.getLogger(__name__)

@@ -11,24 +12,31 @@ def get_last(config):
         raise RuntimeError("Flat file persistence requested, but no log_directory specified.")
     log_dir = Path(config["persistence"]["log_directory"])
     log_dir.mkdir(mode=0o775, parents=True, exist_ok=True)
+    modules = set()
     if "pollers" in config and config["pollers"]:
-        for module in config["active_modules"]["pollers"]:
-            poll_mod = getattr(sys.modules[module], module)
-            ts_field = poll_mod.TIMESTAMP_FIELD
-            log_file = log_dir / (module + ".log")
-            log_file.touch(mode=0o660, exist_ok=True)
-            last = ""
-            with log_file.open('r') as f:
-                for line in f:
-                    if len(line) > 3:
-                        last = line
-            if last:
-                last_event = json.loads(last)
-                config["pollers"][module]["last_polled_time"] = last_event[ts_field]
-                config["pollers"][module]["last_polled_event"] = last_event
-            else:
-                config["pollers"][module]["last_polled_time"] = 0
-                config["pollers"][module]["last_polled_event"] = {}
+        modules.update(config["active_modules"]["pollers"])
+    if "analysis" in config and config["analysis"]:
+        modules.update(config["active_modules"]["analysis"])
+    for module in modules:
+        config_mod = getattr(sys.modules[module], module)
+        ts_field = config_mod.TIMESTAMP_FIELD
+        log_file = log_dir / (module + ".log")
+        log_file.touch(mode=0o660, exist_ok=True)
+        last = ""
+        with log_file.open('r') as f:
+            for line in f:
+                if len(line) > 3:
+                    last = line
+        if "last_polled" not in config:
+            config["last_polled"] = {}
+        config["last_polled"][module] = {}
+        if last:
+            last_event = json.loads(last)
+            config["last_polled"][module]["last_polled_time"] = last_event[ts_field]
+            config["last_polled"][module]["last_polled_event"] = last_event
+        else:
+            config["last_polled"][module]["last_polled_time"] = 0
+            config["last_polled"][module]["last_polled_event"] = {}
     return config


@@ -51,14 +59,17 @@
         raise RuntimeError("Flat file persistence requested, but no log_directory specified.")
     log_dir = Path(config["persistence"]["log_directory"])
     log_dir.mkdir(mode=0o775, parents=True, exist_ok=True)
-    if "pollers" not in config:
+    if "analysis" not in config:
         return data
-    for module in config["active_modules"]["pollers"]:
+    for module in config["active_modules"]["analysis"]:
         data[module] = []
-        poll_mod = getattr(sys.modules[module], module)
-        ts_field = poll_mod.TIMESTAMP_FIELD
+        analysis_mod = getattr(sys.modules[module], module)
+        ts_field = analysis_mod.TIMESTAMP_FIELD
         if "history_limit" in config:
-            limit = max(0, config["pollers"][module]["last_polled_time"] - config["history_limit"])
+            last_time = config["last_polled"][module]["last_polled_time"]
+            if type(last_time) == str and "T" in last_time:
+                last_time = datetime.timestamp(datetime.strptime(last_time, '%Y-%m-%dT%H:%M:%S.%fZ'))
+            limit = max(0, last_time - config["history_limit"])
         else:
             limit = 0
         log_file = log_dir / (module + ".log")
@@ -66,7 +77,10 @@
         with log_file.open('r') as f:
             for line in f:
                 event = json.loads(line)
-                if str(event[ts_field]) >= str(limit):
+                timestamp = event[ts_field]
+                if type(timestamp) == str and "T" in timestamp:
+                    timestamp = datetime.timestamp(datetime.strptime(timestamp, '%Y-%m-%dT%H:%M:%S.%fZ'))
+                if float(timestamp) >= limit:
                     data[module].append(event)
     return data

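The net effect of the flatfile changes is that polling state moves from each module's entry under "pollers" into a single shared "last_polled" dictionary that analysis modules can also read. A hypothetical illustration of the resulting shape (all values below are invented):

```python
# Hypothetical shape of config["last_polled"] after get_last() runs;
# every value here is invented for illustration.
config = {
    "last_polled": {
        "slack": {
            "last_polled_time": 1508593775,                  # numeric epoch
            "last_polled_event": {"user_id": "...", "user_agent": "..."},
        },
        "gsuite": {
            "last_polled_time": "2017-10-21T13:49:35.123Z",  # ISO-8601 string
            "last_polled_event": {"id.uniqueQualifier": "..."},
        },
    },
}
```

Keeping both timestamp shapes in this structure is why get_historical_data() re-applies the ISO check above before doing arithmetic with history_limit.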
diff --git a/gsuite/gsuite.py b/gsuite/gsuite.py
index b00aae1..51bed84 100644
--- a/gsuite/gsuite.py
+++ b/gsuite/gsuite.py
@@ -27,10 +27,10 @@ def poll(config):
         activities = results.get('items', [])
         for event in activities:
             flattened = flatten(event)
-            if flattened[TIMESTAMP_FIELD] > str(config["pollers"]["gsuite"]["last_polled_time"]):
+            if flattened[TIMESTAMP_FIELD] > str(config["last_polled"]["gsuite"]["last_polled_time"]):
                 data.append(flattened)
-            elif flattened[TIMESTAMP_FIELD] == str(config["pollers"]["gsuite"]["last_polled_time"]):
-                if flattened["id.uniqueQualifier"] == config["pollers"]["gsuite"]["last_polled_event"]["id.uniqueQualifier"]:
+            elif flattened[TIMESTAMP_FIELD] == str(config["last_polled"]["gsuite"]["last_polled_time"]):
+                if flattened["id.uniqueQualifier"] == config["last_polled"]["gsuite"]["last_polled_event"]["id.uniqueQualifier"]:
                     caught_up = True
                     break
                 else:
diff --git a/slack/slack.py b/slack/slack.py
index 4e69ae3..8ae9f19 100644
--- a/slack/slack.py
+++ b/slack/slack.py
@@ -19,10 +19,10 @@ def poll(config):
         api_data = slack_api.api_call("team.accessLogs", count=1000, page=i)
         check_api(api_data)
         for event in api_data["logins"]:
-            if event[TIMESTAMP_FIELD] > config["pollers"]["slack"]["last_polled_time"]:
+            if event[TIMESTAMP_FIELD] > config["last_polled"]["slack"]["last_polled_time"]:
                 data.append(event)
-            elif event[TIMESTAMP_FIELD] == config["pollers"]["slack"]["last_polled_time"]:
-                if str(event) == str(config["pollers"]["slack"]["last_polled_event"]):
+            elif event[TIMESTAMP_FIELD] == config["last_polled"]["slack"]["last_polled_time"]:
+                if str(event) == str(config["last_polled"]["slack"]["last_polled_event"]):
                     caught_up = True
                     break
                 else:
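Both pollers above implement the same catch-up idea: page through results, keep events newer than last_polled_time, and stop once the exact event stored in last_polled_event reappears (a timestamp alone is not unique, since several events can share one). A generic, illustrative sketch of that pattern; fetch_page and event_key are hypothetical stand-ins, not functions from this repository:

```python
# Illustrative sketch of the shared catch-up pattern; fetch_page() and
# event_key() are hypothetical stand-ins for each service's API call and
# per-event identity check.
def poll_new_events(fetch_page, event_key, last_time, last_key):
    data = []
    page = 1
    caught_up = False
    while not caught_up:
        events = fetch_page(page)       # pages assumed newest-first; empty ends the walk
        if not events:
            break
        for event in events:
            if event["ts"] > last_time:
                data.append(event)      # strictly newer: always keep
            elif event["ts"] == last_time and event_key(event) == last_key:
                caught_up = True        # the exact event we stopped at last run
                break
            elif event["ts"] < last_time:
                caught_up = True        # already past the stopping point
                break
            else:
                data.append(event)      # same timestamp, different event
        page += 1
    return data
```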