Skip to content

Commit 3ce67d5

Browse files
Python bindings
2 parents b322c19 + ee4a84d commit 3ce67d5

File tree

11 files changed

+355
-4
lines changed

11 files changed

+355
-4
lines changed

CMakeLists.txt

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
11
cmake_minimum_required(VERSION 3.14)
22
project(netspeak4)
33

4+
add_subdirectory(build/dependencies/pybind11-2.9.0)
5+
46
set(CMAKE_CXX_STANDARD 17)
57
set(CMAKE_CXX_STANDARD_REQUIRED ON)
68
set(CMAKE_CXX_EXTENSIONS OFF)
@@ -14,9 +16,11 @@ set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG")
1416
set(CMAKE_CXX_FLAGS_RELEASE "-O3 -DNDEBUG")
1517
set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g")
1618

19+
set(CMAKE_POSITION_INDEPENDENT_CODE ON)
1720

1821
include_directories(
1922
"src"
23+
"build/dependencies/pybind11-2.9.0/include"
2024
"/usr/include/antlr4-runtime"
2125
)
2226

@@ -233,3 +237,49 @@ target_link_libraries(netspeak4 "${NETSPEAK_LINK_LIBS}")
233237
add_executable(netspeak4-test test/netspeak/runner.cpp)
234238
target_sources(netspeak4-test PRIVATE "${NETSPEAK_TEST_SOURCES}")
235239
target_link_libraries(netspeak4-test "${NETSPEAK_TEST_LINK_LIBS}")
240+
241+
set(NETSPEAK_PY_LINK_LIBS
242+
dl
243+
z
244+
rt
245+
246+
# boost
247+
boost_system
248+
boost_regex
249+
250+
# antlr4
251+
antlr4-runtime
252+
)
253+
254+
set(NETSPEAK_PY_SOURCES
255+
"src/py/QueryParser"
256+
257+
"src/antlr4/parse"
258+
"src/antlr4/QueryErrorHandler"
259+
260+
"src/antlr4/generated/QueryBaseListener"
261+
"src/antlr4/generated/QueryBaseVisitor"
262+
"src/antlr4/generated/QueryLexer"
263+
"src/antlr4/generated/QueryListener"
264+
"src/antlr4/generated/QueryParser"
265+
"src/antlr4/generated/QueryVisitor"
266+
267+
"src/netspeak/Dictionaries"
268+
"src/netspeak/error"
269+
"src/netspeak/QueryNormalizer"
270+
271+
"src/netspeak/model/LengthRange"
272+
"src/netspeak/model/NormQuery"
273+
"src/netspeak/model/SimpleQuery"
274+
"src/netspeak/model/Query"
275+
"src/netspeak/model/QuerySyntax"
276+
277+
"src/netspeak/regex/DefaultRegexIndex"
278+
"src/netspeak/regex/parsers"
279+
"src/netspeak/regex/RegexIndex"
280+
"src/netspeak/regex/RegexQuery"
281+
)
282+
283+
pybind11_add_module(netspeak4py src/PythonBindings.cpp "${NETSPEAK_PY_SOURCES}")
284+
set_property(TARGET netspeak4py PROPERTY POSITION_INDEPENDENT_CODE ON)
285+
target_link_libraries(netspeak4py PRIVATE pybind11::module "${NETSPEAK_PY_LINK_LIBS}")

README.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -321,6 +321,23 @@ Run the proxy:
321321
docker run -p 8080:8080 webis/grpcwebproxy:0.14.0 grpcwebproxy --allow_all_origins --backend_addr=host.docker.internal:9000 --backend_tls=false --run_tls_server=false
322322
```
323323

324+
325+
## Python bindings
326+
327+
This project also contains Python 3 bindings using pybind11.
328+
329+
The Python bindings are a shared library (`.so`) and there are some limitations that come from this. The most notable limitation is that the bindings have runtime dependencies (the other Netspeak binaries all statically link their dependencies).
330+
331+
To run the bindings, you need to have boost-regex and boost-system installed. Assuming that you copied `netspeak4py.cpython-38-x86_64-linux-gnu.so` from the builder Docker container, then you also need to copy `/usr/lib/libantlr4-runtime.so` and `/usr/lib/libantlr4-runtime.so.4.7.1`.
332+
333+
To actually import the bindings in Python, make sure to pass the directory in which `netspeak4py...so` is located via the [`PYTHONPATH`](https://docs.python.org/3/using/cmdline.html#envvar-PYTHONPATH) environment variable. If everything is set up correctly, Netspeak's Python bindings can be imported via:
334+
335+
```py
336+
import netspeak4py
337+
```
338+
339+
To see the API of the bindings, run `help(netspeak4py)`.
340+
324341
---
325342

326343
## Contributors

build/install-dependencies.sh

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ fi
1111
cd "$(dirname "$0")"
1212

1313
apt-get update --allow-releaseinfo-change
14-
apt-get install make clang build-essential libboost-filesystem-dev libboost-system-dev libboost-test-dev libboost-program-options-dev libboost-stacktrace-dev libicu-dev libcmph-dev libaio-dev libboost-regex-dev libboost-date-time-dev -y
14+
apt-get install make clang build-essential libboost-filesystem-dev libboost-system-dev libboost-test-dev libboost-program-options-dev libboost-stacktrace-dev libicu-dev libcmph-dev libaio-dev libboost-regex-dev libboost-date-time-dev python3-dev -y
1515

1616

1717
bash ./env/install-antlr4.sh
@@ -22,6 +22,21 @@ mkdir -p ./dependencies
2222
chmod a+rw -R ./dependencies
2323
cd ./dependencies
2424

25+
26+
# Download pybind11
27+
# Fetch and unpack pybind11 v2.9.0 into the current directory
# (./dependencies) unless the versioned directory is already there.
if [ ! -d ./pybind11-2.9.0 ]; then
    echo "Downloading pybind11"

    apt-get install unzip wget -y
    wget -O pybind.zip 'https://github.com/pybind/pybind11/archive/refs/tags/v2.9.0.zip'
    unzip pybind.zip
    rm pybind.zip
    # Use a recursive chmod: without `shopt -s globstar`, the pattern `./**/*`
    # behaves like `./*/*` and would only touch entries exactly two levels deep.
    chmod a+rw -R ./pybind11-2.9.0
else
    echo "pybind11 already present"
fi
38+
39+
2540
if [[ "$1" != "ci" ]]; then
2641

2742
# Download the JAR necessary to compile .g4 files

build/run-tests.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,3 +30,6 @@ fi
3030

3131
echo "Running test..."
3232
./build/debug/netspeak4-test --build_info=yes --detect_memory_leak=1 --log_level=test_suite
33+
34+
echo "Running python tests..."
35+
PYTHONPATH="$(realpath ./build/debug):$PYTHONPATH" python3 ./test/py/test.py

src/PythonBindings.cpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,92 @@
1+
#include <pybind11/pybind11.h>
2+
#include <pybind11/stl.h>
3+
4+
#include <memory>
5+
6+
#include "py/QueryParser.hpp"
7+
8+
using namespace py;
9+
10+
11+
/// Defines the `netspeak4py` Python extension module.
///
/// Exposed Python types:
///  - `NormQueryUnit` (with nested enums `Kind` and `QueryUnitKind`)
///  - `NormQuery`
///  - `QueryParserOptions`
///  - `QueryParser`
PYBIND11_MODULE(netspeak4py, m) {
  m.doc() = "Netspeak4 Python bindings";  // optional module docstring

  // ---------------------------------------------------------------- //
  // NormQueryUnit
  // ---------------------------------------------------------------- //
  pybind11::class_<NormQueryUnit> normQueryUnit(m, "NormQueryUnit");

  // The nested enums are registered BEFORE any method that mentions them.
  // pybind11 renders signatures when `.def` is called, so registering the
  // enums afterwards would leave raw C++ type names in `help(netspeak4py)`.
  pybind11::enum_<NormQueryUnit::Kind>(normQueryUnit, "Kind")
      .value("NormWord", NormQueryUnit::Kind::WORD)
      .value("NormQMark", NormQueryUnit::Kind::QMARK)
      .export_values();

  pybind11::enum_<NormQueryUnit::QueryUnitKind>(normQueryUnit, "QueryUnitKind")
      .value("WORD", NormQueryUnit::QueryUnitKind::WORD)
      .value("QMARK", NormQueryUnit::QueryUnitKind::QMARK)
      .value("STAR", NormQueryUnit::QueryUnitKind::STAR)
      .value("PLUS", NormQueryUnit::QueryUnitKind::PLUS)
      .value("REGEX", NormQueryUnit::QueryUnitKind::REGEX)
      .value("DICTSET", NormQueryUnit::QueryUnitKind::DICTSET)
      .value("ORDERSET", NormQueryUnit::QueryUnitKind::ORDERSET)
      .value("OPTIONSET", NormQueryUnit::QueryUnitKind::OPTIONSET)
      .value("ALTERNATION", NormQueryUnit::QueryUnitKind::ALTERNATION)
      .value("CONCAT", NormQueryUnit::QueryUnitKind::CONCAT)
      .export_values();

  normQueryUnit
      .def(pybind11::init<NormQueryUnit::Kind, std::string,
                          NormQueryUnit::QueryUnitKind>(),
           "Create a new norm query unit", pybind11::arg("kind"),
           pybind11::arg("text"), pybind11::arg("sourceKind"))
      .def("getKind", &NormQueryUnit::get_kind)
      .def("getText", &NormQueryUnit::get_text)
      .def("getSourceKind", &NormQueryUnit::get_source_kind)
      .def("__repr__", [](const NormQueryUnit& u) {
        // Own the label as a std::string value so the concatenation below
        // does not depend on lifetime extension of a temporary.
        const auto kind = u.get_kind();
        const std::string kind_str =
            kind == NormQueryUnit::Kind::WORD
                ? "NormWord"
                : kind == NormQueryUnit::Kind::QMARK ? "NormQMark" : "Invalid";

        return "<netspeak4py.NormQueryUnit " + kind_str + " \"" +
               u.get_text() + "\">";
      });

  // ---------------------------------------------------------------- //
  // NormQuery
  // ---------------------------------------------------------------- //
  pybind11::class_<NormQuery> normQuery(m, "NormQuery");
  normQuery.def(pybind11::init<>())
      .def("getUnits", &NormQuery::get_units)
      .def("__repr__", [](NormQuery& q) {
        // Renders the unit texts separated by single spaces.
        std::string s = "<netspeak4py.NormQuery \"";
        auto first = true;
        for (const auto& u : q.get_units()) {
          if (first) {
            first = false;
          } else {
            s.push_back(' ');
          }

          s.append(u.get_text());
        }

        s.append("\">");
        return s;
      });

  // ---------------------------------------------------------------- //
  // QueryParserOptions
  // ---------------------------------------------------------------- //
  pybind11::class_<QueryParserOptions> queryParserOptions(m,
                                                          "QueryParserOptions");
  queryParserOptions.def(pybind11::init<>())
      .def_readwrite("maxNormQueries", &QueryParserOptions::max_norm_queries)
      .def_readwrite("minLength", &QueryParserOptions::min_length)
      .def_readwrite("maxLength", &QueryParserOptions::max_length)
      .def_readwrite("maxRegexMatches", &QueryParserOptions::max_regex_matches)
      .def_readwrite("maxRegexTimeMs", &QueryParserOptions::max_regex_time_ms);

  // ---------------------------------------------------------------- //
  // QueryParser
  // ---------------------------------------------------------------- //
  pybind11::class_<QueryParser> queryParser(m, "QueryParser");
  queryParser
      .def(pybind11::init<const std::string&, const std::string&, bool>(),
           "Creates a new query parser.", pybind11::arg("vocab") = "",
           pybind11::arg("dictionaryCsv") = "",
           pybind11::arg("lowerCase") = false)
      .def("parse", &QueryParser::parse,
           "Parses the given Netspeak 4 query and returns a list of norm "
           "queries.",
           pybind11::arg("query"), pybind11::arg("options"));
}

src/netspeak/Dictionaries.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,16 @@ const Dictionaries::Map Dictionaries::read_from_file(
1515
const boost::filesystem::path& csv) {
1616
boost::filesystem::ifstream ifs(csv);
1717
util::check(ifs.is_open(), error_message::cannot_open, csv);
18+
return parse_csv(ifs);
19+
}
20+
21+
const Dictionaries::Map Dictionaries::parse_csv(
22+
std::basic_istream<char, std::char_traits<char>>& stream) {
1823
Map dict;
1924
std::string line;
2025
std::vector<std::string> tokens;
2126
const auto predicate = std::bind2nd(std::equal_to<char>(), '\t');
22-
while (std::getline(ifs, line)) {
27+
while (std::getline(stream, line)) {
2328
boost::split(tokens, line, predicate);
2429
if (tokens.size() < 2)
2530
continue;

src/netspeak/Dictionaries.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ class Dictionaries {
2222
typedef std::unordered_multimap<std::string, std::string> Map;
2323

2424
static const Map read_from_file(const boost::filesystem::path& csv);
25+
static const Map parse_csv(
26+
std::basic_istream<char, std::char_traits<char>>& stream);
2527
};
2628

2729
} // namespace netspeak

src/netspeak/model/QuerySyntax.hpp

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
#include <ostream>
55
#include <string>
66

7-
#include "netspeak/value/pair.hpp"
8-
97
namespace netspeak {
108
namespace model {
119

src/py/QueryParser.cpp

Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
#include "py/QueryParser.hpp"

#include <chrono>
#include <memory>
#include <sstream>
#include <string>
#include <utility>
#include <vector>

#include "antlr4/parse.hpp"

#include "netspeak/Dictionaries.hpp"
#include "netspeak/regex/DefaultRegexIndex.hpp"
#include "netspeak/regex/RegexIndex.hpp"
10+
11+
namespace py {
12+
13+
/// Constructs a unit from its kind, its text and the kind of the query unit
/// it was derived from.
///
/// `text` is taken by value and moved into the member, so callers passing a
/// temporary pay for no extra string copy.
NormQueryUnit::NormQueryUnit(NormQueryUnit::Kind kind, std::string text,
                             QueryUnitKind source_kind)
    : kind_(kind), text_(std::move(text)), source_kind_(source_kind) {}
16+
17+
/// Returns the kind this unit was constructed with.
NormQueryUnit::Kind NormQueryUnit::get_kind() const { return kind_; }
20+
const std::string& NormQueryUnit::get_text() const {
21+
return text_;
22+
}
23+
/// Returns the source query unit kind this unit was constructed with.
NormQueryUnit::QueryUnitKind NormQueryUnit::get_source_kind() const { return source_kind_; }
26+
27+
28+
std::vector<NormQueryUnit>& NormQuery::get_units() {
29+
return units_;
30+
}
31+
32+
33+
/// Builds the normalizer configuration used by `QueryParser`.
///
/// @param vocab          Passed to the `DefaultRegexIndex` constructor. When
///                       empty, no regex index is created and the config's
///                       `regex_index` stays null.
/// @param dictionary_csv Dictionary *contents* (not a file path); parsed via
///                       `Dictionaries::parse_csv`. When empty, the config's
///                       `dictionary` stays null.
/// @param lower_case     Forwarded unchanged into the config.
/// @return The populated `InitConfig`.
netspeak::QueryNormalizer::InitConfig get_init_config(
    const std::string& vocab, const std::string& dictionary_csv,
    bool lower_case) {
  // Default-constructed shared_ptrs are already null.
  std::shared_ptr<netspeak::regex::RegexIndex> regex_index;
  if (!vocab.empty()) {
    regex_index = std::make_shared<netspeak::regex::DefaultRegexIndex>(vocab);
  }

  std::shared_ptr<netspeak::Dictionaries::Map> dictionary;
  if (!dictionary_csv.empty()) {
    std::istringstream iss(dictionary_csv);
    // parse_csv returns a `const Map` by value; wrapping it in std::move
    // cannot turn the copy into a move, so the result is passed directly.
    dictionary = std::make_shared<netspeak::Dictionaries::Map>(
        netspeak::Dictionaries::parse_csv(iss));
  }

  return {
      .regex_index = regex_index,
      .dictionary = dictionary,
      .lower_case = lower_case,
  };
}
54+
/// Creates a parser whose normalizer is configured from the given vocabulary,
/// dictionary CSV contents and lower-casing flag (see `get_init_config`).
QueryParser::QueryParser(const std::string& vocab,
                         const std::string& dictionary_csv, bool lower_case)
    : normalizer(get_init_config(vocab, dictionary_csv, lower_case)) {
}
57+
58+
/// Converts a `netspeak::model::NormQuery` into the binding-level `NormQuery`.
///
/// For every unit of `input`, one `NormQueryUnit` is appended that carries the
/// unit's tag, a copy of its text and the tag of its source query unit.
///
/// NOTE(review): `input` is taken by value, which copies the whole query on
/// each call. Switching to `const&` looks safe, but confirm this function is
/// not declared in a header before changing the signature.
NormQuery convert_norm_query(netspeak::model::NormQuery input) {
  NormQuery result;
  auto& out_units = result.get_units();
  for (const auto& unit : input.units()) {
    // Construct in place: avoids the intermediate std::string copy and the
    // temporary NormQueryUnit that push_back would require.
    out_units.emplace_back(unit.tag(), *unit.text(),
                           unit.source().unit->tag());
  }
  return result;
}
69+
std::vector<NormQuery> QueryParser::parse(const std::string& query,
70+
QueryParserOptions options) {
71+
const auto parsed_query = antlr4::parse_query(query);
72+
std::vector<netspeak::model::NormQuery> norm_queries;
73+
normalizer.normalize(parsed_query,
74+
{
75+
.max_norm_queries = options.max_norm_queries,
76+
.min_length = options.min_length,
77+
.max_length = options.max_length,
78+
.max_regex_matches = options.max_regex_matches,
79+
.max_regex_time = std::chrono::milliseconds(
80+
options.max_regex_time_ms),
81+
},
82+
norm_queries);
83+
84+
// convert norm query types
85+
std::vector<NormQuery> result;
86+
for (const auto& nq : norm_queries) {
87+
result.push_back(convert_norm_query(nq));
88+
}
89+
90+
return result;
91+
}
92+
93+
} // namespace py

0 commit comments

Comments
 (0)