tpotQueryAPI/libQueryTpot.py at main · lsp-cyber/tpotQueryAPI · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import logging
from datetime import datetime
from elasticsearch import Elasticsearch, exceptions
from collections import Counter
from typing import List, Dict
from tqdm import tqdm

# Configure the root logger as a side effect of importing this module:
# records at INFO level and above are written to ./pullTPOTDataForOTX.log
# (a path relative to the process's current working directory), each line
# formatted as "<timestamp> - <level> - <message>".
logging.basicConfig(
    filename='./pullTPOTDataForOTX.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

class TPotQuery:
    """
    A class to query TPOT Elasticsearch data, excluding noisy sources,
    and collect recent logs for further processing (e.g., STIX bundle creation).

    The hits fetched by the most recent `pull_recent_logs` call are kept in
    `self.results`; the `summarize_*` helpers operate on any list of
    Elasticsearch-style hit dicts passed to them.
    """

    def __init__(self, es_host, index_name, api_key_id, api_key, batch_size=1000,
                 scroll_time='2m', config=None, verify_certs=False):
        """
        Initialize the TPotQuery class.

        Args:
            es_host (str): Elasticsearch server hostname or IP.
            index_name (str): Elasticsearch index to query.
            api_key_id (str): Elasticsearch API key ID.
            api_key (str): Elasticsearch API key value.
            batch_size (int): Number of documents to fetch per batch.
            scroll_time (str): Scroll context retention duration.
            config (dict, optional): Configuration mapping. When given, the list
                at ``config['tpot']['ignore_types']`` names the ``type`` values
                to exclude from queries. When omitted, nothing is excluded.
            verify_certs (bool): Whether the client verifies the server's TLS
                certificate. Defaults to False to preserve existing behavior.

        Raises:
            KeyError: If `config` is provided but lacks
                ``config['tpot']['ignore_types']``.
            elasticsearch.exceptions.ConnectionError: Re-raised (after logging)
                if the client constructor reports a connection error.
        """
        self.index_name = index_name
        self.batch_size = batch_size
        self.scroll_time = scroll_time
        self.config = config
        # `config` defaults to None, so it must not be indexed unconditionally.
        # A config that IS supplied is still read strictly, so a malformed one
        # fails loudly instead of silently disabling the exclusions.
        self.ignore_types = [] if config is None else config['tpot']['ignore_types']
        self.results = []

        try:
            # SECURITY NOTE(review): with verify_certs=False the API key is sent
            # over a TLS channel whose peer is not authenticated. Pass
            # verify_certs=True wherever the server certificate can be validated.
            self.es = Elasticsearch(
                hosts=[es_host],
                api_key=(api_key_id, api_key),
                verify_certs=verify_certs,
            )
        except exceptions.ConnectionError as e:
            logging.error(f"Elasticsearch connection error: {e}")
            raise

    @staticmethod
    def _source_of(entry: Dict) -> Dict:
        """Return the entry's '_source' mapping, or {} when it is absent or null."""
        return entry.get('_source') or {}

    def _build_recent_query(self, minutes_back) -> Dict:
        """Build the search body: last `minutes_back` minutes, minus ignored types."""
        bool_clause = {
            "must": [
                {
                    "range": {
                        "@timestamp": {
                            "gte": f"now-{minutes_back}m",
                            "lte": "now"
                        }
                    }
                }
            ]
        }
        # Only add the exclusion clause when there is something to exclude.
        if self.ignore_types:
            bool_clause["must_not"] = [{"terms": {"type": self.ignore_types}}]
        return {"query": {"bool": bool_clause}}

    def pull_recent_logs(self, minutes_back=5) -> List[Dict]:
        """
        Pull logs from Elasticsearch within the past `minutes_back` minutes,
        excluding types in the ignore list.

        Each call starts from an empty result list, so `self.results` holds only
        the hits of the most recent call. Errors are logged rather than raised;
        in that case the hits collected before the failure are returned.

        Args:
            minutes_back (int): Number of minutes to look back from current time.

        Returns:
            list: All matching documents retrieved from Elasticsearch.
        """
        query_body = self._build_recent_query(minutes_back)

        # Rebind instead of clearing in place, so a list returned by an earlier
        # call (and possibly still held by the caller) is left untouched.
        self.results = []
        scroll_id = None

        logging.info(f"Querying Elasticsearch index '{self.index_name}' for recent data...")
        try:
            response = self.es.search(
                index=self.index_name,
                body=query_body,
                scroll=self.scroll_time,
                size=self.batch_size
            )
            scroll_id = response.get('_scroll_id')
            hits = response['hits']['hits']

            # An empty page signals that the scroll is exhausted.
            while hits:
                self.results.extend(hits)
                response = self.es.scroll(scroll_id=scroll_id, scroll=self.scroll_time)
                # Keep the last known id if a page does not carry one.
                scroll_id = response.get('_scroll_id') or scroll_id
                hits = response['hits']['hits']

            logging.info(f"Retrieved {len(self.results)} records from '{self.index_name}'.")

        except Exception:
            logging.error("Failed to retrieve logs from Elasticsearch", exc_info=True)

        finally:
            # Release the scroll context on failure too, so it does not linger
            # on the server until `scroll_time` expires.
            if scroll_id is not None:
                try:
                    self.es.clear_scroll(scroll_id=scroll_id)
                except Exception:
                    logging.warning("Failed to clear Elasticsearch scroll context", exc_info=True)

        return self.results

    def show_results(self) -> None:
        """
        Print one numbered line per stored hit: its type, source IP and the
        full '_source' document. Missing fields are shown as 'MISSING'.
        """
        for count, item in enumerate(self.results, start=1):
            source = self._source_of(item)
            event_type = source.get('type', 'MISSING')
            src_ip = source.get('src_ip', 'MISSING')
            print(f"[{count}] {event_type} {src_ip} : {source}")

    def summarize_type_field(self, entries: List[Dict]) -> Dict[str, int]:
        """
        Summarizes the count of each unique 'type' field found within the '_source' key
        of a list of Elasticsearch-style entries. Entries without a 'type' are
        counted under 'MISSING'. The summary is also printed.

        Args:
            entries (List[Dict]): List of log entries with '_source' containing the 'type' field.

        Returns:
            Dict[str, int]: Dictionary mapping each unique type to its count.
        """
        type_counts = Counter(
            self._source_of(entry).get('type', 'MISSING') for entry in entries
        )

        print(f"Total unique types: {len(type_counts)}")
        for t, count in type_counts.items():
            # `!s` formats via str() so non-string values pad like strings.
            print(f"Type: {t!s:<20} Count: {count}")

        return dict(type_counts)

    def summarize_credentials(self, entries: List[Dict]) -> Dict[tuple, int]:
        """
        Summarizes username/password combinations from a list of log entries.
        Only entries carrying both fields (non-null) are counted. The summary
        is also printed.

        Args:
            entries (list): List of Elasticsearch-style dicts with '_source' keys.

        Returns:
            dict: Dictionary with (username, password) tuples as keys and their occurrence counts as values.
        """
        cred_counter = Counter()

        for entry in entries:
            source = self._source_of(entry)
            username = source.get('username')
            password = source.get('password')

            # Empty strings are legitimate credentials; only null/absent is skipped.
            if username is not None and password is not None:
                cred_counter[(username, password)] += 1

        print(f"Total unique username/password pairs: {len(cred_counter)}")
        for (user, pwd), count in cred_counter.items():
            print(f"Username: {user!s:<20} Password: {pwd!s:<20} Count: {count}")

        return dict(cred_counter)

    def summarize_hashes(self, entries: List[Dict]) -> Dict[str, int]:
        """
        Summarizes unique file hashes (from the 'shasum' field) in the list of log entries.
        Entries with a missing or empty 'shasum' are skipped. The summary is
        also printed.

        Args:
            entries (list): List of Elasticsearch-style dicts with '_source' keys.

        Returns:
            dict: Dictionary with each unique shasum and its occurrence count.
        """
        hash_counter = Counter()

        for entry in entries:
            shasum = self._source_of(entry).get('shasum')
            if shasum:
                hash_counter[shasum] += 1

        print(f"Total unique file hashes (shasum): {len(hash_counter)}")
        for shasum, count in hash_counter.items():
            print(f"SHA Sum: {shasum!s:<64} Count: {count}")

        return dict(hash_counter)

    def summarize_inputs(self, entries: List[Dict]) -> Dict[str, int]:
        """
        Summarizes the count of each unique value in the 'input' field from the log entries.
        Entries with a missing or empty 'input' are skipped. The summary is
        also printed.

        Args:
            entries (list): List of Elasticsearch-style dicts with '_source' keys.

        Returns:
            dict: Dictionary with each unique input value and its count.
        """
        input_counter = Counter()

        for entry in entries:
            input_value = self._source_of(entry).get('input')
            if input_value:
                input_counter[input_value] += 1

        print(f"Total unique input values: {len(input_counter)}")
        for value, count in input_counter.items():
            print(f"Input: {value!s:<20} Count: {count}")

        return dict(input_counter)