Skip to content

Commit 9aeb587

Browse files
committed
add selector query engine, a modified version of q.nim
1 parent 9438d5f commit 9aeb587

File tree

4 files changed

+317
-20
lines changed

4 files changed

+317
-20
lines changed

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,20 @@ This module written for *compile time* XML parsing purpose, it supports only som
1111
> *The parser is simple and small, with no error checking/correcting. Use it at your own risk.*
1212
1313
If you need a more powerful XML/HTML parser, consider using [parsexml](https://nim-lang.org/docs/parsexml.html)
14+
15+
16+
This module contains a modified version of my [q.nim](https://github.com/OpenSystemsLab/q.nim) module, named `selector`.
17+
18+
Just import `xml/selector` to use it.
19+
20+
### Usage:
21+
```nim
22+
import xml, xml/selector
23+
24+
var d = q($readFile("test.html"))
25+
26+
27+
assert d.select("head *").len == 2
28+
echo d.select("head *")
29+
30+
```

src/xml/selector.nim

Lines changed: 222 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,222 @@
1+
import pegs, strutils, ../xml
2+
from streams import newStringStream
3+
from strtabs import hasKey
4+
5+
6+
# PEG building blocks used to take apart one compound selector token such as
# `tag#id.class[attr=value]`.
let
7+
# an attribute name: a letter followed by letters, digits, `_` or `-`
attribute = r"[a-zA-Z][a-zA-Z0-9_\-]*"
8+
# captures one `.class` part, including the leading dot
classes = r"{\.[a-zA-Z0-9_][a-zA-Z0-9_\-]*}"
9+
# captures one whole bracketed part: `[name]` or `[name=value]`, where `=`
# may be prefixed by one of `*`, `^`, `$`, `~` and the value may be quoted
attributes = r"{\[" & attribute & r"\s*([\*\^\$\~]?\=\s*[\'""]?(\s*\ident\s*)+[\'""]?)?\]}"
10+
# a full compound selector: optional tag, optional `#id`, any number of
# class parts, any number of bracketed attribute parts
pselectors = peg(r"\s*{\ident}?({'#'\ident})? (" & classes & ")* " & attributes & "*")
11+
# splits one bracketed part into captures: whole text, name, operator, value
pattributes = peg(r"{\[{" & attribute & r"}\s*({[\*\^\$\~]?}\=\s*[\'""]?{(\s*\ident\s*)+}[\'""]?)?\]}")
12+
13+
type
14+
# One bracketed attribute condition of a selector.
Attribute = object
15+
# attribute name to look up on the node
name: string
16+
# first character of the operator text; left at its default value when the
# condition has no operator (see newAttribute)
operator: char
17+
# text the attribute value is compared against
value: string
18+
19+
# One parsed compound selector together with the combinator that links it
# to the preceding selector.
Selector = object
20+
# ' ' by default (newSelector); select() sets '>' or the sibling
# combinator character
combinator: char
21+
# tag name; "" or "*" is treated as "any tag" by match()
tag: string
22+
# required id, "" when none
id: string
23+
# class names taken from the `.class` parts
classes: seq[string]
24+
# attribute conditions taken from the `[...]` parts
attributes: seq[Attribute]
25+
26+
# Set of nodes a query starts from.
QueryContext = object
27+
root: seq[XmlNode]
28+
29+
proc newSelector(tag, id = "", classes: seq[string] = @[], attributes: seq[Attribute] = @[]): Selector =
  ## Builds a `Selector` from the given parts. The combinator always starts
  ## out as a space character; callers overwrite it when needed.
  Selector(combinator: ' ', tag: tag, id: id, classes: classes,
           attributes: attributes)
35+
36+
proc initContext(root: seq[XmlNode]): QueryContext =
  ## Wraps a list of starting nodes in a `QueryContext`.
  QueryContext(root: root)
38+
39+
proc initContext(root: XmlNode): QueryContext =
  ## Wraps a single starting node in a `QueryContext`.
  QueryContext(root: @[root])
41+
42+
proc newAttribute(n, o, v: string): Attribute =
  ## Builds an attribute condition from name `n`, operator text `o` and
  ## value `v`. Only the first character of `o` is kept; when `o` is empty
  ## the operator field keeps its default value.
  result = Attribute(name: n, value: v)
  if o.len > 0:
    result.operator = o[0]
48+
49+
proc q*(n: XmlNode): QueryContext =
  ## Creates a query context whose only starting node is `n`.
  result = initContext(n)
52+
53+
proc q*(n: seq[XmlNode]): QueryContext =
  ## Creates a query context that starts from every node in `n`.
  result = initContext(n)
56+
57+
proc q*(xml: string): QueryContext =
  ## Parses `xml` with `parseXml` and creates a query context rooted at the
  ## resulting node.
  let parsed = parseXml(xml)
  initContext(@[parsed])
63+
64+
65+
proc match(n: XmlNode, s: Selector): bool =
  ## Returns true when node `n` satisfies every condition of selector `s`:
  ## tag name, id, each listed class and each attribute condition.
  ##
  ## Every class and every attribute condition must hold. The check stops at
  ## the first failure, so a later condition can no longer overwrite an
  ## earlier negative result.

  # an empty tag or "*" accepts any node name
  if s.tag != "" and s.tag != "*" and n.name != s.tag:
    return false

  if s.id != "" and n.attr("id") != s.id:
    return false

  if s.classes.len > 0:
    let classAttr = n.attr("class")
    if classAttr == "":
      return false
    # split once instead of once per requested class
    let nodeClasses = classAttr.split()
    for cls in s.classes:
      if cls notin nodeClasses:
        return false

  for cond in s.attributes:
    let value = n.attr(cond.name)
    var ok: bool
    case cond.operator
    of '\0':
      if cond.value.len == 0:
        # [attr]: the attribute only has to carry a non-empty value
        ok = value.len > 0
      else:
        # [attr=value]: exact match
        ok = cond.value == value
    of '^':
      ok = value.startsWith(cond.value)
    of '$':
      ok = value.endsWith(cond.value)
    of '*':
      ok = value.contains(cond.value)
    of '~':
      # [attr~=value]: value must be one of the whitespace-separated words
      ok = cond.value in value.split()
    else:
      ok = false
    if not ok:
      return false

  result = true
93+
94+
proc searchSimple(parent: XmlNode, selector: Selector, found: var seq[XmlNode]) =
  ## Appends to `found` every child of `parent` that matches `selector`.
  ## Unless the selector's combinator is '>', the search continues
  ## recursively into each child, so all descendants are visited.
  ##
  ## A node is added only once: when several starting nodes are nested inside
  ## each other, the same descendant is reached more than once, and
  ## `searchCombined` already skips such duplicates.
  for child in parent.children:
    if match(child, selector) and child notin found:
      found.add(child)
    # '>' restricts the search to direct children, so do not descend
    if selector.combinator != '>':
      child.searchSimple(selector, found)
100+
101+
proc searchSimple(parents: var seq[XmlNode], selector: Selector) =
  ## Runs the single-node `searchSimple` for every node in `parents` and
  ## replaces `parents` with the collected matches.
  var collected = newSeq[XmlNode]()
  for node in parents:
    searchSimple(node, selector, collected)
  parents = collected
107+
108+
proc searchCombined(parent: XmlNode, selectors: seq[Selector], found: var seq[XmlNode]) =
## Matches a chain of selectors joined by sibling combinators against the
## children of `parent`. Children matching the last selector of the chain
## are appended to `found`, skipping nodes already present in it.
109+
# child indexes from which the current selector starts scanning;
# the first selector scans from the first child
var starts: seq[int] = @[0]
110+
# start indexes collected for the next selector
var matches: seq[int]
111+
112+
# matching selector by selector
113+
for i in 0..selectors.len-1:
114+
var selector = selectors[i]
115+
matches = @[]
116+
117+
for j in starts:
118+
# nothing to scan when the node has no child list
if parent.children.isNil:
119+
continue
120+
for k in j..parent.children.len-1:
121+
var child = parent.children[k]
122+
123+
if match(child, selector):
124+
if i < selectors.len-1:
125+
# save current index for next search
126+
# next selector will only search for nodes followed by this node
127+
matches.add(k+1)
128+
else:
129+
# no more selector, return matches
130+
if not found.contains(child):
131+
found.add(child)
132+
# NOTE(review): for '+' the scan from this start index stops early; the
# original nesting of this check is not recoverable from this view
# (after the first child examined vs. after the first match) - confirm
if selector.combinator == '+':
133+
break
134+
starts = matches
135+
136+
proc searchCombined(parents: var seq[XmlNode], selectors: seq[Selector]) =
  ## Runs the single-node `searchCombined` for every node in `parents` and
  ## replaces `parents` with the collected matches.
  var collected = newSeq[XmlNode]()
  for node in parents:
    searchCombined(node, selectors, collected)
  parents = collected
142+
143+
proc parseSelector(token: string): Selector =
  ## Turns one whitespace-free selector token (for example
  ## `ul.menu[type=x]`) into a `Selector`. The token `*` yields the
  ## universal selector; otherwise each capture of `pselectors` is
  ## classified by its first character.
  result = newSelector()
  if token == "*":
    # Universal selector
    result.tag = "*"
  elif token =~ pselectors:
    # Type selector with optional id, classes and attribute conditions
    for captured in matches:
      if captured.isNil:
        continue

      case captured[0]
      of '#':
        # drop the leading '#'
        result.id = captured.substr(1)
      of '.':
        # drop the leading '.'
        result.classes.add(captured.substr(1))
      of '[':
        # `matches` below is the capture array of the inner pattern match
        if captured =~ pattributes:
          result.attributes.add(newAttribute(matches[1], matches[2], matches[3]))
      else:
        result.tag = captured
169+
170+
proc select*(q: QueryContext, s: string = ""): seq[XmlNode] =
  ## Return list of nodes matched by CSS selector
  ##
  ## `s` is a whitespace-separated sequence of compound selectors and the
  ## combinators `>`, `+` and `~`. Starting from the context's root nodes,
  ## each selector narrows the current result set. An empty or blank `s`
  ## returns the root nodes unchanged.
  ##
  ## Raises `ValueError` when a sibling combinator (`+` or `~`) is the last
  ## token.
  result = q.root

  # splitWhitespace, unlike split, never yields empty tokens, so repeated,
  # leading or trailing blanks cannot turn into match-everything selectors
  let tokens = s.splitWhitespace()
  if tokens.len == 0:
    return

  for pos in 0 .. tokens.high:
    # the operand of a sibling combinator was already consumed by the chain
    # that an earlier token started
    if pos > 0 and tokens[pos-1] in ["+", "~"]:
      continue

    if tokens[pos] in [">", "~", "+"]: # ignore combinators
      continue

    var selector = parseSelector(tokens[pos])
    if pos > 0 and tokens[pos-1] == ">":
      selector.combinator = '>'

    var selectors = @[selector]

    # collect every `+ x` / `~ x` pair that directly follows this token
    var next = pos + 1
    while next < tokens.len and tokens[next] in ["+", "~"]:
      if next + 1 >= tokens.len:
        raise newException(ValueError, "a selector expected after sibling combinator: " & tokens[next])

      var sibling = parseSelector(tokens[next + 1])
      sibling.combinator = tokens[next][0]
      selectors.add(sibling)
      next += 2

    # a lone selector is a descendant/child search; a chain needs the
    # sibling-aware search
    if selectors.len == 1:
      result.searchSimple(selectors[0])
    else:
      result.searchCombined(selectors)
220+
221+
proc select*(n: XmlNode, s: string = ""): seq[XmlNode] {.inline.} =
  ## Shortcut: runs the selector `s` with `n` as the only starting node.
  result = select(q(n), s)

tests/test.html

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
<html>
2+
<head>
3+
<title>Example</title>
4+
<script></script>
5+
</head>
6+
<body>
7+
<nav>
8+
<div class="col-1"></div>
9+
<ul class="menu and">
10+
<li class="dropdown">
11+
<a href="#" id="link1">Link <span>1</span></a>
12+
<ul>
13+
<li>
14+
<a href="#">Link 11</a>
15+
</li>
16+
<li>
17+
<a href="#">Link 12</a>
18+
</li>
19+
</ul>
20+
</li>
21+
<li>
22+
<a href="#">Link 2</a>
23+
</li>
24+
</ul>
25+
<a href="#">Link 100</a>
26+
<div class="col-2"></div>
27+
<a href="#">Link 104</a>
28+
<div class="col-3"></div>
29+
<a href="#">Link 101</a>
30+
<a href="#">Link 102</a>
31+
</nav>
32+
<form class="form-horizontal">
33+
<div class="form-group">
34+
<label for="inputEmail3" class="col-sm-2 control-label">Email</label>
35+
<div class="col-sm-10">
36+
<input type="email" class="form-control" id="inputEmail3" placeholder="Email" />
37+
</div>
38+
</div>
39+
<div class="form-group">
40+
<label for="inputPassword3" class="col-sm-2 control-label">Password</label>
41+
<div class="col-sm-10">
42+
<input type="password" data class="form-control" id="inputPassword3" placeholder="Password" />
43+
</div>
44+
</div>
45+
<div class="form-group">
46+
<div class="col-sm-offset-2 col-sm-10">
47+
<div class="checkbox">
48+
<label>
49+
<input type="checkbox" /> Remember me
50+
</label>
51+
</div>
52+
</div>
53+
</div>
54+
<div class="form-group">
55+
<div class="col-sm-offset-2 col-sm-10">
56+
<button type="submit" class="btn btn-default">Sign in</button>
57+
</div>
58+
</div>
59+
</form>
60+
</body>
61+
</html>

tests/test1.nim

Lines changed: 17 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,20 @@
1-
import pegs, xml
1+
import os, xml, xml/selector
22

3-
const example = """<?xml version="1.0" encoding="UTF-8"?>
4-
<classes>
5-
<class name="Klient">
6-
<attr type="int">id</attr>
7-
<attr type="String">imie</attr>
8-
<attr type="String">nazwisko</attr>
9-
<attr type="Date">dataUr</attr>
10-
</class>
11-
<class name="Wizyta">
12-
<attr type="int">id</attr>
13-
<attr type="Klient">klient</attr>
14-
<attr type="Date">data</attr>
15-
</class>
16-
</classes>
17-
"""
3+
var d = q($readFile(getAppDir() & "/test.html"))
184

19-
var matches: seq[string] = @[]
20-
doAssert(example.match(grammar, matches))
215

22-
for m in matches:
23-
echo m
6+
assert d.select("head *").len == 2
7+
echo d.select("head *")
8+
echo d.select("ul li a")
9+
echo d.select("ul.menu > li a")
10+
echo d.select("ul.menu > li > a")
11+
echo d.select("ul.menu.and > li > a")
12+
echo d.select("#link1")
13+
echo d.select("input[type]")
14+
echo d.select("input[type=password]")
15+
echo d.select("input[type='password']")
16+
echo d.select("input[type=\"password\"]")
17+
echo d.select("input[type^=pa]")
18+
echo d.select("input[type$=ord]")
19+
echo d.select("input[type*=ss]")
20+
echo d.select("nav ul.menu ~ div + a")

0 commit comments

Comments
 (0)