Skip to content

Commit 74f90df

Browse files
committed
Implementing extractStructuralTokens as a helper for the detection engine
1 parent f932a3f commit 74f90df

6 files changed

Lines changed: 87 additions & 7 deletions

File tree

data/txt/sha256sums.txt

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -162,13 +162,13 @@ df768bcb9838dc6c46dab9b4a877056cb4742bd6cfaaf438c4a3712c5cc0d264 extra/shutils/
162162
1966ca704961fb987ab757f0a4afddbf841d1a880631b701487c75cef63d60c3 extra/vulnserver/__init__.py
163163
617cec1b731e0baacafa6f58c2f56a85b6128d1416627cc1b2f61519c8539a2e extra/vulnserver/vulnserver.py
164164
a2bf70d7f87c3a4e0675c0bad54119a4e04efa6ea2730a8338d5aebcd995630e lib/controller/action.py
165-
9137a8f7368496c84b21944f6b94c28004d3a2a849ac9c8e0b20e294e4c4a93a lib/controller/checks.py
165+
f4fb3839e5accd1b58b34226e4b26f5079d9696e24d335d37d870cd5e62d1e80 lib/controller/checks.py
166166
666935b658074dc9c42153622b75d4ec7bfe56fbe0742de827a5d30a1a0f9d96 lib/controller/controller.py
167167
d69e84f1648cdb907f5d2dd454f03874a4613752b07867510145d51d84b3c56f lib/controller/handler.py
168168
1966ca704961fb987ab757f0a4afddbf841d1a880631b701487c75cef63d60c3 lib/controller/__init__.py
169169
9c5764c92ce536d1f0f96200359ee5ef1f37f9128769bf990cb77f1d1f8e17b1 lib/core/agent.py
170170
c51c33501cc905586a9aaac93b06f2ac6f71628d032a7dc39fd0ef05d7ee3856 lib/core/bigarray.py
171-
122767794156afa41b19baa706ad4c124eef6eaf73ed8fd208d8f634e97e82eb lib/core/common.py
171+
d143df718fbaacb617b6046c73cf4e47932e1a25928a4e1ecb87ea77a3b154ed lib/core/common.py
172172
8f1272487e1adfcc8c755a2f56f0c6d21eac5e685a73a9a159482f9dc9142bc5 lib/core/compat.py
173173
a683d0ad9ba543587382c4903d28db610ae20394fcf9045a68b2ab54a39381ae lib/core/convert.py
174174
c03dc585f89642cfd81b087ac2723e3e1bb3bfa8c60e6f5fe58ef3b0113ebfe6 lib/core/data.py
@@ -182,14 +182,14 @@ f8de57606325456928e46ae2896f5f8bbec9ad18b1c644b492a566fa992216f6 lib/core/decor
182182
1966ca704961fb987ab757f0a4afddbf841d1a880631b701487c75cef63d60c3 lib/core/__init__.py
183183
914a13ee21fd610a6153a37cbe50830fcbd1324c7ebc1e7fc206d5e598b0f7ad lib/core/log.py
184184
5a576f802f1298d0aa357e766ae6502fa53cacbbe0b1d328b7410a8b20a885b2 lib/core/optiondict.py
185-
e033b20a0f7821797a10f4bf4235723f38c7db551c611fbb713faa621b123c4a lib/core/option.py
185+
98d3d61278794705c7039e40fab66a626e8d6ab765383c5379cec7a066b09301 lib/core/option.py
186186
21b2b1745107c211fc7593923a3da7a808d40763c00091c28de5f7c129bcf3bc lib/core/patch.py
187187
49c0fa7e3814dfda610d665ee02b12df299b28bc0b6773815b4395514ddf8dec lib/core/profiling.py
188188
0c36a65b6237732eb001d333f80f0c58c088ff01ae80cf07e4dcc6da2a806364 lib/core/readlineng.py
189189
9bf174058f15d14e24e94f9aaf42df045119d3617c6c54bd2f3af79b462f331d lib/core/replication.py
190190
0b8c38a01bb01f843d94a6c5f2075ee47520d0c4aa799cecea9c3e2c5a4a23a6 lib/core/revision.py
191191
888daba83fd4a34e9503fe21f01fef4cc730e5cde871b1d40e15d4cbc847d56c lib/core/session.py
192-
098e5d86a0da05d4be5f5ed5371083954be2369abce57fda4bd906d12e1f8870 lib/core/settings.py
192+
a2fb281b59c4526613f22fc0e994b68db91c1263db415aa86002ec4e20773639 lib/core/settings.py
193193
c7804223319e18eb0b8e2cbf0a8b6896d1cefb7b0b1a2e9f1cf826a8a3b56750 lib/core/shell.py
194194
a2e98a94b231432736d6b304fc75525c8b5fdb4768c418387c5b4c1a610dad64 lib/core/subprocessng.py
195195
19f1e3c5e3ba703d28d510cd7a9ab8284d5fbe9df5ce7e77c86e5931571364b7 lib/core/target.py
@@ -211,7 +211,7 @@ c2f34e27578742e729c2fa9c1d4f0a0d8f8f7f4cf0fc14c62ec817a260c71dec lib/parse/site
211211
1be3da334411657461421b8a26a0f2ff28e1af1e28f1e963c6c92768f9b0847c lib/request/basicauthhandler.py
212212
369484a2999d29f49bf839a329d1686ed94f6ea27c695e027fe08c8da51f30a3 lib/request/basic.py
213213
bc61bc944b81a7670884f82231033a6ac703324b34b071c9834886a92e249d0e lib/request/chunkedhandler.py
214-
d4bb0869b03602a0c8f9e0e0fd217753f14ddadf848fc9f3c65a74d03feb9958 lib/request/comparison.py
214+
9c0dccc1cee66d38478aaf75a7c513d0d136d50a90b15fed146faa1653899fe1 lib/request/comparison.py
215215
729e07a2ca6b1d83563e9c6dc5a884d1b664c1764be06776ea93bde305164f0c lib/request/connect.py
216216
8e06682280fce062eef6174351bfebcb6040e19976acff9dc7b3699779783498 lib/request/direct.py
217217
a6b37b436838caeb197fea858d0a39fadbff4736256e741b5fcec1f28fcf1ce0 lib/request/dns.py

lib/controller/checks.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
from lib.core.agent import agent
1717
from lib.core.common import Backend
1818
from lib.core.common import extractRegexResult
19+
from lib.core.common import extractStructuralTokens
1920
from lib.core.common import extractTextTagContent
2021
from lib.core.common import filterNone
2122
from lib.core.common import findDynamicContent
@@ -1390,7 +1391,26 @@ def checkStability():
13901391
raise SqlmapNoneDataException(errMsg)
13911392

13921393
else:
1393-
checkDynamicContent(firstPage, secondPage)
1394+
# Before engaging the (lossy) dynamic-content removal / '--text-only' escalation, check
1395+
# whether the page is structurally stable (identical tag/class/id skeleton across the two
1396+
# requests) despite differing text. If so, base the comparison on that value-free structure
1397+
# so that dynamic content (e.g. per-render result rows) does not mask an injection. This is
1398+
# the HTML counterpart of the structure-aware JSON comparison
1399+
if firstPage and secondPage and extractStructuralTokens(firstPage) == extractStructuralTokens(secondPage):
1400+
kb.pageStructurallyStable = True
1401+
1402+
if kb.nullConnection:
1403+
debugMsg = "turning off NULL connection "
1404+
debugMsg += "support because of structural page comparison"
1405+
logger.debug(debugMsg)
1406+
1407+
kb.nullConnection = None
1408+
1409+
infoMsg = "target URL content is not byte-stable but structurally stable; sqlmap "
1410+
infoMsg += "will base the page comparison on the page structure"
1411+
logger.info(infoMsg)
1412+
else:
1413+
checkDynamicContent(firstPage, secondPage)
13941414

13951415
return kb.pageStable
13961416

lib/core/common.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -176,6 +176,9 @@
176176
from lib.core.settings import SENSITIVE_DATA_REGEX
177177
from lib.core.settings import SENSITIVE_OPTIONS
178178
from lib.core.settings import STDIN_PIPE_DASH
179+
from lib.core.settings import STRUCTURAL_CLASS_REGEX
180+
from lib.core.settings import STRUCTURAL_ID_REGEX
181+
from lib.core.settings import STRUCTURAL_TAG_REGEX
179182
from lib.core.settings import SUPPORTED_DBMS
180183
from lib.core.settings import TEXT_TAG_REGEX
181184
from lib.core.settings import TIME_STDEV_COEFF
@@ -3227,6 +3230,45 @@ def extractTextTagContent(page):
32273230

32283231
return filterNone(_.group("result").strip() for _ in re.finditer(TEXT_TAG_REGEX, page))
32293232

3233+
def extractStructuralTokens(page):
    """
    Builds the set of structural tokens found in a (HTML) page while ignoring its textual content.
    Three kinds of tokens are produced: "tag:<name>" for every matched tag, "cls:<name>.<class>"
    for every whitespace-separated value of its class attribute and "id:<name>#<id>" for its
    non-empty id attribute (tag names are lower-cased, attribute values are kept as found).
    An empty or None page results in an empty set

    >>> sorted(extractStructuralTokens(u'<div id="g" class="a b"><span>x</span></div>')) == [u'cls:div.a', u'cls:div.b', u'id:div#g', u'tag:div', u'tag:span']
    True
    >>> extractStructuralTokens(u'<table><tr><td>1</td></tr></table>') == set([u'tag:table', u'tag:tr', u'tag:td'])
    True
    >>> extractStructuralTokens(u'') == set()
    True
    """

    def _attributeValue(match):
        # Attribute regexes capture the value in one of three alternative groups
        # (double-quoted, single-quoted, unquoted); at most one of them is non-empty
        return match.group(1) or match.group(2) or match.group(3) or ""

    content = page or ""

    # Any tag containing the reflected value marker is blanked out as a whole, so that it
    # contributes no tokens
    if REFLECTED_VALUE_MARKER in content:
        content = re.sub(r"(?i)<[^>]*%s[^>]*>" % REFLECTED_VALUE_MARKER, " ", content)

    # Script blocks, comments and style blocks are blanked out before tags are searched for
    content = re.sub(r"(?si)<script.+?</script>|<!--.+?-->|<style.+?</style>", " ", content)

    tokens = set()

    for element in re.finditer(STRUCTURAL_TAG_REGEX, content):
        name = element.group(1).lower()
        attributes = element.group(2) or ""

        tokens.add("tag:%s" % name)

        # One token per individual class name
        for found in re.finditer(STRUCTURAL_CLASS_REGEX, attributes):
            tokens.update("cls:%s.%s" % (name, item) for item in _attributeValue(found).split())

        # One token per non-blank id value
        for found in re.finditer(STRUCTURAL_ID_REGEX, attributes):
            identifier = _attributeValue(found).strip()
            if identifier:
                tokens.add("id:%s#%s" % (name, identifier))

    return tokens
3271+
32303272
def trimAlphaNum(value):
32313273
"""
32323274
Trims alpha numeric characters from start and ending of a given value

lib/core/option.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2210,6 +2210,7 @@ def _setKnowledgeBaseAttributes(flushAll=True):
22102210
kb.pageTemplates = dict()
22112211
kb.pageEncoding = DEFAULT_PAGE_ENCODING
22122212
kb.pageStable = None
2213+
kb.pageStructurallyStable = None
22132214
kb.partRun = None
22142215
kb.permissionFlag = False
22152216
kb.place = None

lib/core/settings.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
from thirdparty import six
2121

2222
# sqlmap version (<major>.<minor>.<month>.<monthly commit>)
23-
VERSION = "1.10.6.198"
23+
VERSION = "1.10.6.199"
2424
TYPE = "dev" if VERSION.count('.') > 2 and VERSION.split('.')[-1] != '0' else "stable"
2525
TYPE_COLORS = {"dev": 33, "stable": 90, "pip": 34}
2626
VERSION_STRING = "sqlmap/%s#%s" % ('.'.join(VERSION.split('.')[:-1]) if VERSION.count('.') > 2 and VERSION.split('.')[-1] == '0' else VERSION, TYPE)
@@ -180,6 +180,13 @@
180180
# Regular expression used for extracting content from "textual" tags
181181
TEXT_TAG_REGEX = r"(?si)<(abbr|acronym|b|blockquote|br|center|cite|code|dt|em|font|h[1-6]|i|li|p|pre|q|strong|sub|sup|td|th|title|tt|u)(?!\w).*?>(?P<result>[^<]+)"
182182

183+
# Regular expressions used for extracting a value-free structural skeleton of a (HTML) page (tag
184+
# names and class/id attribute hooks), for structure-aware comparison of pages whose textual
185+
# content is dynamic but whose layout is stable
186+
STRUCTURAL_TAG_REGEX = r"(?si)<\s*([a-z][a-z0-9]*)((?:\s+[^<>]*)?)/?>"
187+
STRUCTURAL_CLASS_REGEX = r"""(?si)\bclass\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
188+
STRUCTURAL_ID_REGEX = r"""(?si)\bid\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'<>]+))"""
189+
183190
# Regular expression used for recognition of IP addresses
184191
IP_ADDRESS_REGEX = r"\b(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\b"
185192

lib/request/comparison.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import re
1111

1212
from lib.core.common import extractRegexResult
13+
from lib.core.common import extractStructuralTokens
1314
from lib.core.common import getFilteredPageContent
1415
from lib.core.common import jsonMinimize
1516
from lib.core.common import listToStrValue
@@ -177,6 +178,15 @@ def _comparison(page, headers, code, getRatioValue, pageLength):
177178
seq1 = jsonMinimize(kb.pageTemplate)
178179
seq2 = jsonMinimize(rawPage)
179180

181+
# Structure-aware comparison for a structurally-stable (but byte-unstable) HTML page:
182+
# compare the value-free tag/class/id skeleton so dynamic text does not perturb the ratio
183+
# while a structural change (e.g. a results table appearing/disappearing) still does
184+
if seq1 is None and kb.pageStructurallyStable and not (conf.titles or conf.textOnly or kb.nullConnection):
185+
_ = "\n".join(sorted(extractStructuralTokens(kb.pageTemplate)))
186+
if _: # only engage when the page actually exposes structure (HTML tags); tagless content falls back to text
187+
seq1 = _
188+
seq2 = "\n".join(sorted(extractStructuralTokens(rawPage)))
189+
180190
if seq1 is None or seq2 is None:
181191
if conf.titles:
182192
seq1 = extractRegexResult(HTML_TITLE_REGEX, seqMatcher.a)

0 commit comments

Comments
 (0)