diff --git a/.gitignore b/.gitignore index 51f3eee22..af22edb51 100644 --- a/.gitignore +++ b/.gitignore @@ -82,6 +82,8 @@ tuning_list etc/tune 2kprime.1 drprimes.txt +etc/multiplying* +etc/squaring* # ignore stuff generated by "make manual" and "make poster" *.aux @@ -134,3 +136,20 @@ build*/ # kdevelop section .kdev4/ *.kdev4 + +# ignore cmake files +CMakeFiles +Makefile +cmake_install.cmake + + + + + + + + + + + + diff --git a/CMakeLists.txt b/CMakeLists.txt index 014fb1883..2e4e649e2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,8 @@ include(sources.cmake) # Options #----------------------------------------------------------------------------- option(BUILD_SHARED_LIBS "Build shared library and only the shared library if \"ON\", default is static" OFF) - +option(BUILD_TUNING "Run a tuning program for the fast multiplication/squaring algorithms if \"ON\"" OFF) +option(BUILD_GRAPHS "Run a benchmark of the fast multiplication/squaring algorithms and make graphics if \"ON\"" OFF) #----------------------------------------------------------------------------- # Compose CFLAGS #----------------------------------------------------------------------------- @@ -137,6 +138,14 @@ if(BUILD_TESTING) add_subdirectory(demo) endif() +#----------------------------------------------------------------------------- +# tuning and benchmark targets +#----------------------------------------------------------------------------- + +if(BUILD_TUNING OR BUILD_GRAPHS) + add_subdirectory(etc ${CMAKE_CURRENT_SOURCE_DIR}/etc) +endif() + #----------------------------------------------------------------------------- # Install/export targets and files #----------------------------------------------------------------------------- diff --git a/demo/timing.c b/demo/timing.c index a66ce8505..ce33d3a23 100644 --- a/demo/timing.c +++ b/demo/timing.c @@ -55,42 +55,35 @@ static unsigned int lbit(void) } } -/* RDTSC from Scott Duplichan */ -static uint64_t TIMFUNC(void) -{ -#if defined 
__GNUC__ -#if defined(__i386__) || defined(__x86_64__) - /* version from http://www.mcs.anl.gov/~kazutomo/rdtsc.html - * the old code always got a warning issued by gcc, clang did not complain... - */ - unsigned hi, lo; - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - return ((uint64_t)lo)|(((uint64_t)hi)<<32); -#else /* gcc-IA64 version */ - unsigned long result; - __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory"); - - while (__builtin_expect((int) result == -1, 0)) - __asm__ __volatile__("mov %0=ar.itc":"=r"(result)::"memory"); - - return result; -#endif - /* Microsoft and Intel Windows compilers */ -#elif defined _M_IX86 - __asm rdtsc -#elif defined _M_AMD64 - return __rdtsc(); -#elif defined _M_IA64 -#if defined __INTEL_COMPILER -#include +#if defined(_WIN32) +# include #endif - return __getReg(3116); + +static uint64_t TIMFUNC(void) +{ +#if _POSIX_C_SOURCE >= 199309L +#define LTM_BILLION 1000000000 + struct timespec ts; + + /* TODO: Sets errno in case of error. Use? 
*/ + clock_gettime(CLOCK_MONOTONIC, &ts); + return (((uint64_t)ts.tv_sec) * LTM_BILLION + (uint64_t)ts.tv_nsec); +#elif defined(_WIN32) + LARGE_INTEGER ticks; + QueryPerformanceCounter(&ticks); + return (uint64_t)ticks.QuadPart; #else -#error need rdtsc function for this build + clock_t t; + t = clock(); + if (t < (clock_t)(0)) { + return (uint64_t)(0); + } + return (uint64_t)(t); #endif } + #define DO2(x) do { mp_err err = x; err = x; (void)err; }while(0) #define DO4(x) DO2(x); DO2(x) #define DO8(x) DO4(x); DO4(x) @@ -141,6 +134,12 @@ int main(int argc, char **argv) int n, cnt, ix, old_kara_m, old_kara_s, old_toom_m, old_toom_s; unsigned rr; +#ifdef _WIN32 + LARGE_INTEGER Frequency; +#else + struct timespec ts; +#endif + CHECK_OK(mp_init(&a)); CHECK_OK(mp_init(&b)); CHECK_OK(mp_init(&c)); @@ -150,10 +149,21 @@ int main(int argc, char **argv) srand(LTM_TIMING_RAND_SEED); - +#ifdef _WIN32 + QueryPerformanceFrequency(&Frequency); + CLK_PER_SEC = (uint64) Frequency; +#elif _POSIX_C_SOURCE >= 199309L + /* returns -1 for an error and 0 for okay, sets errno (not used here) */ + if (clock_getres(CLOCK_MONOTONIC, &ts)) { + fprintf(stderr, "%d, clock_getres failed\n", __LINE__); + exit(EXIT_FAILURE); + } + CLK_PER_SEC = LTM_BILLION / ts.tv_nsec; +#else CLK_PER_SEC = TIMFUNC(); sleep(1); CLK_PER_SEC = TIMFUNC() - CLK_PER_SEC; +#endif printf("CLK_PER_SEC == %" PRIu64 "\n", CLK_PER_SEC); diff --git a/doc/bn.tex b/doc/bn.tex index 63e71633b..dfedc8295 100644 --- a/doc/bn.tex +++ b/doc/bn.tex @@ -102,6 +102,7 @@ \section{License} \section{Building LibTomMath} + LibTomMath is meant to be very ``GCC friendly'' as it comes with a makefile well suited for GCC. However, the library will also build in MSVC, Borland C out of the box. For any other ISO C compiler a makefile will have to be made by the end @@ -270,6 +271,53 @@ \subsection{Testing} test was invoked. If an error is detected the program will exit with a dump of the relevant numbers it was working with. 
+\subsection{CMake} +Some of the options above are also available with CMake. + +\subsubsection{Shared Library} +The default is a static library. To produce a shared library use the CMake option +\begin{alltt} +-DBUILD_SHARED_LIBS=ON +\end{alltt} + +\subsubsection{Testing} +To run the testsuite use option +\begin{alltt} +-DBUILD_TESTING=ON +\end{alltt} + +\subsubsection{Tuning} +To run the tuning itself use option +\begin{alltt} +-DBUILD_TUNING=ON +\end{alltt} + +To run a benchmark with the tuned library and print plots of the benchmark tables use option +\begin{alltt} +-DBUILD_GRAPHS=ON +\end{alltt} + +To compile with LTO (Link Time Optimization) use option +\begin{alltt} +-DCOMPILE_LTO=ON +\end{alltt} + +There are several build types available: +\begin{description} +\item[Debug] Build a library with debugging symbols (\texttt{-g3}) and no extra optimization +\item[Release] Build the normal release version (\texttt{-O3 -funroll-loops -fomit-frame-pointer}) (default) +\item[RelWithDebInfo] Build a library with debugging symbols (\texttt{-g3 -O2}) and a bit of optimization +\item[MinSizeRel] Build a small sized library (\texttt{-Os}) +\end{description} +The build types are case-sensitive! + +Choose one with: +\begin{alltt} +-DCMAKE_BUILD_TYPE=buildtype +\end{alltt} + + + \section{Build Configuration} LibTomMath can configured at build time in two phases we shall call ``depends'' and ``trims''. Each phase changes how the library is built and they are applied one after another @@ -1600,13 +1648,35 @@ \section{Tuning Polynomial Basis Routines} make tune \end{alltt} -This will run a benchmark, computes the medians, rewrites \texttt{bncore.c}, and recompiles -\texttt{bncore.c} and relinks the library. +With CMake +\begin{alltt} +cmake -DBUILD_TUNING=ON /path/to/source/dir && cmake --build . +\end{alltt} + + +This will run a benchmark, compute the medians, rewrite \texttt{tommath\_cutoffs.h}, recompile +\texttt{cutoffs.c}, and relink the library. 
The benchmark itself can be fine--tuned in the file \texttt{etc/tune\_it.sh}. The program \texttt{etc/tune} is also able to print a list of values for printing curves with e.g.: -\texttt{gnuplot}. type \texttt{./etc/tune -h} to get a list of all available options. +\texttt{gnuplot}. Type \texttt{./etc/tune -h} to get a list of all the available options. There +are a lot. + +To get some nice plots in \texttt{etc} try + +\begin{alltt} +make graphs +\end{alltt} + +With CMake +\begin{alltt} +cmake -DBUILD_GRAPHS=ON /path/to/source/dir && cmake --build . +\end{alltt} + +This will run a benchmark, compute the medians, rewrite \texttt{tommath\_cutoffs.h}, recompile +\texttt{cutoffs.c}, relink the library and run gnuplot to print plots in the PNG format. The size +of the images is fixed in the file \texttt{etc/plot\_graphs.gp} and has to be changed manually. \chapter{Modular Reduction} diff --git a/etc/CMakeLists.txt b/etc/CMakeLists.txt new file mode 100644 index 000000000..0d6bc1c2c --- /dev/null +++ b/etc/CMakeLists.txt @@ -0,0 +1,51 @@ +# SPDX-License-Identifier: Unlicense +# +# LibTomMath, a free open source portable number theoretic multiple-precision +# integer (MPI) library written entirely in C. +# + +cmake_minimum_required(VERSION 3.10) + +set(LTM_TUNE tune-ltm) + +# This file can be included from the top level or used stand-alone +if(PROJECT_NAME) + set(LIBRARY_NAME ${PROJECT_NAME}) +else() + # Define an independent project and all the necessary stuff around + project(${LTM_TUNE} + LANGUAGES C) + set(LIBRARY_NAME libtommath) + find_package(${LIBRARY_NAME}) + if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) + set(CMAKE_BUILD_TYPE "Release") + endif() +endif() + +add_executable(tune + ${CMAKE_CURRENT_SOURCE_DIR}/tune.c +) + +target_include_directories(tune PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_CURRENT_SOURCE_DIR}/.. 
+) + +target_link_libraries(tune PRIVATE + ${LIBRARY_NAME} +) + +target_compile_options(tune PRIVATE + ${LTM_C_FLAGS} +) +target_link_options(tune BEFORE PUBLIC + ${LTM_LD_FLAGS} +) + +if(BUILD_GRAPHS) + # used in tune_it.sh + find_program(GNUPLOT gnuplot) + add_custom_command(TARGET tune POST_BUILD COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/tune_it.sh 1000 WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} VERBATIM) +else() + add_custom_command(TARGET tune POST_BUILD COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/tune_it.sh WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR} VERBATIM) +endif() diff --git a/etc/makefile b/etc/makefile index 0e178396f..ecdf9b1f7 100644 --- a/etc/makefile +++ b/etc/makefile @@ -6,7 +6,7 @@ LTM_TUNE_CFLAGS = $(CFLAGS) $(LTM_CFLAGS) -Wall -W -Wextra -Wshadow -O3 -I../ # libname when you can't install the lib with install LIBNAME=../libtommath.a -all: pprime tune test_standalone mersenne drprime 2kprime mont +all: pprime tune test_standalone mersenne drprime 2kprime mont graphs #provable primes pprime: pprime.o @@ -36,10 +36,15 @@ drprime: drprime.o mont: mont.o $(CC) $(LTM_TUNE_CFLAGS) mont.o $(LIBNAME) -o mont +# Make pretty pictures (1000 is the maximum number of limbs to print for mul/sqr) +# "tune" runs twice because it runs automatically when build. +graphs: tune + ./tune_it.sh 1000 clean: rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime mont 2kprime pprime.dat \ - tuning_list multiplying squaring test *.da *.dyn *.dpi *~ + tuning_list get_limbsize out *.da *.dyn *.dpi *~ cmake_install.cmake Makefile rm -rf .libs + rm -rf CMakeFiles .PHONY: tune diff --git a/etc/makefile.icc b/etc/makefile.icc index 9217f7b1d..2a7272a1e 100644 --- a/etc/makefile.icc +++ b/etc/makefile.icc @@ -32,20 +32,10 @@ tune: tune.o $(CC) $(CFLAGS) tune.o $(LIBNAME) -o tune ./tune_it.sh -# same app but using RDTSC for higher precision [requires 80586+], coff based gcc installs [e.g. 
ming, cygwin, djgpp] -tune86: tune.c - nasm -f coff timer.asm - $(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86 - -# for cygwin -tune86c: tune.c - nasm -f gnuwin32 timer.asm - $(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86 - -#make tune86 for linux or any ELF format -tune86l: tune.c - nasm -f elf -DUSE_ELF timer.asm - $(CC) -DX86_TIMER $(CFLAGS) tune.c timer.o $(LIBNAME) -o tune86l +# Make pretty pictures (1000 is the maximum number of limbs to print for mul/sqr) +# "tune" runs twice because it runs automatically when build. +graphs: tune + ./tune_it.sh 1000 # spits out mersenne primes mersenne: mersenne.o @@ -64,4 +54,4 @@ mont: mont.o clean: - rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime tune86 tune86l mont 2kprime pprime.dat *.il tuning_list + rm -f *.log *.o *.obj *.exe pprime tune mersenne drprime mont 2kprime pprime.dat get_limbsize *.il tuning_list diff --git a/etc/plot_graphs.gp b/etc/plot_graphs.gp new file mode 100644 index 000000000..c78511d1c --- /dev/null +++ b/etc/plot_graphs.gp @@ -0,0 +1,19 @@ +set term pngcairo size 720,540 +# Good for most colorblinds +set colorsequence podo + +set key top left; + +set ylabel "Time" +set xlabel "Operand size (limbs)" + +set output "multiplying".ARG1.".png"; +set title "Comparing fast and slow multiplying [".ARG1." bits limbsize]"; +plot "multiplying".ARG1."" using 1:2 w lines t "slow", "multiplying".ARG1."" using 1:3 w lines t "fast" + +set output "squaring".ARG1.".png"; +set title "Comparing fast and slow squaring [".ARG1." 
bits limbsize]"; +plot "squaring".ARG1."" using 1:2 w lines t "slow", "squaring".ARG1."" using 1:3 w lines t "fast" + + + diff --git a/etc/timer.asm b/etc/timer.asm deleted file mode 100644 index 35890d985..000000000 --- a/etc/timer.asm +++ /dev/null @@ -1,37 +0,0 @@ -; x86 timer in NASM -; -; Tom St Denis, tomstdenis@iahu.ca -[bits 32] -[section .data] -time dd 0, 0 - -[section .text] - -%ifdef USE_ELF -[global t_start] -t_start: -%else -[global _t_start] -_t_start: -%endif - push edx - push eax - rdtsc - mov [time+0],edx - mov [time+4],eax - pop eax - pop edx - ret - -%ifdef USE_ELF -[global t_read] -t_read: -%else -[global _t_read] -_t_read: -%endif - rdtsc - sub eax,[time+4] - sbb edx,[time+0] - ret - \ No newline at end of file diff --git a/etc/tune.c b/etc/tune.c index 8ad202890..2f78da9cc 100644 --- a/etc/tune.c +++ b/etc/tune.c @@ -24,6 +24,9 @@ static uint64_t s_time_mul(int size); static uint64_t s_time_sqr(int size); static void s_usage(char *s); +#if defined(_WIN32) +# include +#endif static uint64_t s_timer_function(void) { #if _POSIX_C_SOURCE >= 199309L @@ -33,6 +36,10 @@ static uint64_t s_timer_function(void) /* TODO: Sets errno in case of error. Use? 
*/ clock_gettime(CLOCK_MONOTONIC, &ts); return (((uint64_t)ts.tv_sec) * LTM_BILLION + (uint64_t)ts.tv_nsec); +#elif defined(_WIN32) + LARGE_INTEGER ticks; + QueryPerformanceCounter(&ticks); + return (uint64_t)ticks.QuadPart; #else clock_t t; t = clock(); @@ -42,7 +49,6 @@ static uint64_t s_timer_function(void) return (uint64_t)(t); #endif } - /* generic ISO C timer */ static uint64_t s_timer_tmp; static void s_timer_start(void) @@ -66,7 +72,7 @@ static uint64_t s_time_mul(int size) int x; mp_err e; mp_int a, b, c, d; - uint64_t t1; + uint64_t t1 = 0u; if ((e = mp_init_multi(&a, &b, &c, &d, NULL)) != MP_OKAY) { t1 = UINT64_MAX; @@ -82,12 +88,14 @@ static uint64_t s_time_mul(int size) goto LBL_ERR; } - s_timer_start(); + for (x = 0; x < s_number_of_test_loops; x++) { + s_timer_start(); if ((e = mp_mul(&a,&b,&c)) != MP_OKAY) { t1 = UINT64_MAX; goto LBL_ERR; } + t1 += s_timer_stop(); if (s_check_result == 1) { if ((e = s_mp_mul_full(&a,&b,&d)) != MP_OKAY) { t1 = UINT64_MAX; @@ -101,7 +109,7 @@ static uint64_t s_time_mul(int size) } } - t1 = s_timer_stop(); + LBL_ERR: mp_clear_multi(&a, &b, &c, &d, NULL); return t1; @@ -112,7 +120,7 @@ static uint64_t s_time_sqr(int size) int x; mp_err e; mp_int a, b, c; - uint64_t t1; + uint64_t t1 = 0u; if ((e = mp_init_multi(&a, &b, &c, NULL)) != MP_OKAY) { t1 = UINT64_MAX; @@ -124,12 +132,14 @@ static uint64_t s_time_sqr(int size) goto LBL_ERR; } - s_timer_start(); + for (x = 0; x < s_number_of_test_loops; x++) { + s_timer_start(); if ((e = mp_sqr(&a,&b)) != MP_OKAY) { t1 = UINT64_MAX; goto LBL_ERR; } + t1 += s_timer_stop(); if (s_check_result == 1) { if ((e = s_mp_sqr(&a,&c)) != MP_OKAY) { t1 = UINT64_MAX; @@ -142,7 +152,7 @@ static uint64_t s_time_sqr(int size) } } - t1 = s_timer_stop(); + LBL_ERR: mp_clear_multi(&a, &b, &c, NULL); return t1; @@ -288,7 +298,7 @@ int main(int argc, char **argv) int opt; struct cutoffs orig, updated; - FILE *squaring, *multiplying; + FILE *squaring, *multiplying, *out; char mullog[256] = 
"multiplying"; char sqrlog[256] = "squaring"; s_number_of_test_loops = 64; @@ -309,6 +319,14 @@ int main(int argc, char **argv) if (argc != 1) { for (opt = 1; (opt < argc) && (argv[opt][0] == '-'); opt++) { switch (argv[opt][1]) { + case 'Z': + out = fopen("out", "w"); + if (out == NULL) { + fprintf(stderr, "Opening file \"%s\" failed\n", "out"); + exit(EXIT_FAILURE); + } + fprintf(out,"%d",MP_DIGIT_BIT); + exit(EXIT_SUCCESS); case 'T': args.testmode = 1; s_check_result = 1; @@ -433,6 +451,7 @@ int main(int argc, char **argv) } } + /* mp_rand uses the cryptographically secure source of the OS by default. That is too expensive, too slow and @@ -469,7 +488,6 @@ int main(int argc, char **argv) if (test[n].fn != NULL) { s_run(test[n].name, test[n].fn, test[n].cutoff); *test[n].update = *test[n].cutoff; - *test[n].cutoff = INT_MAX; } } } @@ -479,14 +497,10 @@ int main(int argc, char **argv) updated.SQR_KARATSUBA, updated.MUL_TOOM, updated.SQR_TOOM); - } else { - printf("MUL_KARATSUBA_CUTOFF = %d\n", updated.MUL_KARATSUBA); - printf("SQR_KARATSUBA_CUTOFF = %d\n", updated.SQR_KARATSUBA); - printf("MUL_TOOM_CUTOFF = %d\n", updated.MUL_TOOM); - printf("SQR_TOOM_CUTOFF = %d\n", updated.SQR_TOOM); } if (args.print == 1) { + printf("Printing data for graphing to \"%s\" and \"%s\"\n",mullog, sqrlog); multiplying = fopen(mullog, "w+"); diff --git a/etc/tune_it.sh b/etc/tune_it.sh index dba5b6968..2666080bd 100755 --- a/etc/tune_it.sh +++ b/etc/tune_it.sh @@ -61,7 +61,7 @@ i=1 while [ $i -le $LIMIT ]; do RNUM=$(LCG) printf "\r%d" $i - "$MPWD"/tune -t -r $RLOOPS -L $LAG -S "$RNUM" -o $OFFSET >> $FILE_NAME || die "tune" $? + "$MPWD"/tune -t -r $RLOOPS -L $LAG -S "$RNUM" -o $OFFSET >> $FILE_NAME || die "tune" $? 
i=$((i + 1)) done @@ -92,15 +92,29 @@ END_OF_INPUT i=$(tail -n +2 $FILE_NAME | wc -l) # our median point will be at $i entries i=$(( (i / 2) + 1 )) -TMP=$(median $FILE_NAME 1 $i) -echo "#define MP_DEFAULT_MUL_KARATSUBA_CUTOFF $TMP" -echo "#define MP_DEFAULT_MUL_KARATSUBA_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(km) Appending to $TOMMATH_CUTOFFS_H" $? -TMP=$(median $FILE_NAME 2 $i) -echo "#define MP_DEFAULT_SQR_KARATSUBA_CUTOFF $TMP" -echo "#define MP_DEFAULT_SQR_KARATSUBA_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(ks) Appending to $TOMMATH_CUTOFFS_H" $? -TMP=$(median $FILE_NAME 3 $i) -echo "#define MP_DEFAULT_MUL_TOOM_CUTOFF $TMP" -echo "#define MP_DEFAULT_MUL_TOOM_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(tc3m) Appending to $TOMMATH_CUTOFFS_H" $? -TMP=$(median $FILE_NAME 4 $i) -echo "#define MP_DEFAULT_SQR_TOOM_CUTOFF $TMP" -echo "#define MP_DEFAULT_SQR_TOOM_CUTOFF $TMP" >> $TOMMATH_CUTOFFS_H || die "(tc3s) Appending to $TOMMATH_CUTOFFS_H" $? +TMP1=$(median $FILE_NAME 1 $i) +echo "#define MP_DEFAULT_MUL_KARATSUBA_CUTOFF $TMP1" +echo "#define MP_DEFAULT_MUL_KARATSUBA_CUTOFF $TMP1" >> $TOMMATH_CUTOFFS_H || die "(km) Appending to $TOMMATH_CUTOFFS_H" $? +TMP2=$(median $FILE_NAME 2 $i) +echo "#define MP_DEFAULT_SQR_KARATSUBA_CUTOFF $TMP2" +echo "#define MP_DEFAULT_SQR_KARATSUBA_CUTOFF $TMP2" >> $TOMMATH_CUTOFFS_H || die "(ks) Appending to $TOMMATH_CUTOFFS_H" $? +TMP3=$(median $FILE_NAME 3 $i) +echo "#define MP_DEFAULT_MUL_TOOM_CUTOFF $TMP3" +echo "#define MP_DEFAULT_MUL_TOOM_CUTOFF $TMP3" >> $TOMMATH_CUTOFFS_H || die "(tc3m) Appending to $TOMMATH_CUTOFFS_H" $? +TMP4=$(median $FILE_NAME 4 $i) +echo "#define MP_DEFAULT_SQR_TOOM_CUTOFF $TMP4" +echo "#define MP_DEFAULT_SQR_TOOM_CUTOFF $TMP4" >> $TOMMATH_CUTOFFS_H || die "(tc3s) Appending to $TOMMATH_CUTOFFS_H" $? + +# Print the tables for the graphs. Please do not change. +if [ $# -eq 1 ]; then + OPTION0=$1 + # Do not forget to raise if there are more fast algorithms with higher cutoffs. 
+ if [ $OPTION0 -lt 500 ]; then + OPTION0=500 + fi + "$MPWD"/tune -Z + "$MPWD"/tune -p -r $RLOOPS -L $LAG -S "$RNUM" -o $OFFSET -M $OPTION0 -G $(cat out) -s $TMP1,$TMP2,$TMP3,$TMP4 + gnuplot -c plot_graphs.gp $(cat out) +fi + + diff --git a/logs/Makefile b/logs/Makefile index 59987e519..0d73db987 100644 --- a/logs/Makefile +++ b/logs/Makefile @@ -6,4 +6,4 @@ cmp: gnuplot before_after.dem clean: - rm -f *-*.log *.png graphs-*.dem + rm -f *.log *.png graphs-*.dem diff --git a/logs/before_after.dem b/logs/before_after.dem index 33721ad20..3d7b9edee 100644 --- a/logs/before_after.dem +++ b/logs/before_after.dem @@ -1,26 +1,34 @@ set terminal png +# Good for most colorblinds +set colorsequence podo +set key top left; + set ylabel "Cycles per Operation" set xlabel "Operand size (bits)" -set output "addsub-ba.png" +set output "addsub-ba-".ARG1.".png" +set title "Addition/subtraction before and after profiling. Version: ".ARG1 plot 'add-before.log' smooth bezier title "Addition (before)", \ 'add-after.log' smooth bezier title "Addition (after)", \ 'sub-before.log' smooth bezier title "Subtraction (before)", \ 'sub-after.log' smooth bezier title "Subtraction (after)" -set output "mult-ba.png" +set output "mult-ba-".ARG1.".png" +set title "Multiplication before and after profiling. Version: ".ARG1 plot 'mult-before.log' smooth bezier title "Multiplication (without Karatsuba) (before)", \ 'mult-after.log' smooth bezier title "Multiplication (without Karatsuba) (after)", \ 'mult_kara-before.log' smooth bezier title "Multiplication (Karatsuba) (before)", \ 'mult_kara-after.log' smooth bezier title "Multiplication (Karatsuba) (after)" -set output "sqr-ba.png" +set output "sqr-ba-".ARG1.".png" +set title "Squaring before and after profiling. 
Version: ".ARG1 plot 'sqr-before.log' smooth bezier title "Squaring (without Karatsuba) (before)", \ 'sqr-after.log' smooth bezier title "Squaring (without Karatsuba) (after)", \ 'sqr_kara-before.log' smooth bezier title "Squaring (Karatsuba) (before)", \ 'sqr_kara-after.log' smooth bezier title "Squaring (Karatsuba) (after)" -set output "expt-ba.png" +set output "expt-ba-".ARG1.".png" +set title "Exponentiation algorithms before and after profiling. Version: ".ARG1 plot 'expt-before.log' smooth bezier title "Exptmod (Montgomery) (before)", \ 'expt-after.log' smooth bezier title "Exptmod (Montgomery) (after)", \ 'expt_dr-before.log' smooth bezier title "Exptmod (Diminished Radix) (before)", \ @@ -30,7 +38,8 @@ plot 'expt-before.log' smooth bezier title "Exptmod (Montgomery) (before)", \ 'expt_2kl-before.log' smooth bezier title "Exptmod (2k-l Reduction) (before)", \ 'expt_2kl-after.log' smooth bezier title "Exptmod (2k-l Reduction) (after)" -set output "invmod-ba.png" +set output "invmod-ba-".ARG1.".png" +set title "Invmod algorithms before and after profiling. 
Version: ".ARG1 plot 'invmod-before.log' smooth bezier title "Modular Inverse (before)", \ 'invmod-after.log' smooth bezier title "Modular Inverse (after)" diff --git a/makefile b/makefile index 8f211f5f2..e112fb2ff 100644 --- a/makefile +++ b/makefile @@ -69,17 +69,62 @@ profiled: rm -f *.a *.o timing make CFLAGS="$(CFLAGS) -fbranch-probabilities" +# run tune first, than optimize branching +profiled_tuned: tune + make CFLAGS="$(CFLAGS) -fprofile-arcs" timing + ./timing + rm -f *.a *.o timing + make CFLAGS="$(CFLAGS) -fbranch-probabilities" + +# run tune first, optimize branching, run tune again +# (running it in a loop until the timings stabilize is indeed tempting) +profiled_tuned_tuned: tune + make CFLAGS="$(CFLAGS) -fprofile-arcs" timing + ./timing + rm -f *.a *.o timing + make CFLAGS="$(CFLAGS) -fbranch-probabilities" + ./etc/tune_it.sh + #make a single object profiled library amalgamated_timing: pre_gen $(CC) $(LTM_CFLAGS) -fprofile-arcs -c pre_gen/tommath_amalgam.c -o tommath_amalgam.o $(CC) $(LTM_CFLAGS) -DMP_VERSION=\"before\" demo/timing.c tommath_amalgam.o -lgcov -o timing + +amalgamated_timing_tuned: tune pre_gen + $(CC) $(LTM_CFLAGS) -fprofile-arcs -c pre_gen/tommath_amalgam.c -o tommath_amalgam.o + $(CC) $(LTM_CFLAGS) -DMP_VERSION=\"before\" demo/timing.c tommath_amalgam.o -lgcov -o timing + profiled_single: amalgamated_timing ./timing rm -f *.o timing $(CC) $(LTM_CFLAGS) -fbranch-probabilities -c pre_gen/tommath_amalgam.c -o tommath_amalgam.o $(AR) $(ARFLAGS) $(LIBNAME) tommath_amalgam.o +# run tune first, than optimize branching +profiled_single_tuned: amalgamated_timing_tuned + ./timing + rm -f *.a *.o timing + $(CC) $(LTM_CFLAGS) -fbranch-probabilities -c pre_gen/tommath_amalgam.c -o tommath_amalgam.o + $(AR) $(ARFLAGS) $(LIBNAME) tommath_amalgam.o + +# run tune first, optimize branching, run tune again +# (running it in a loop until the timings stabilize is indeed tempting) +profiled_single_tuned_tuned: amalgamated_timing_tuned + ./timing + rm -f 
*.o timing + $(CC) $(LTM_CFLAGS) -fbranch-probabilities -c pre_gen/tommath_amalgam.c -o tommath_amalgam.o + $(AR) $(ARFLAGS) $(LIBNAME) tommath_amalgam.o + rm -f etc/tune.o + $(CC) $(LTM_CFLAGS) -c etc/tune.c -o etc/tune.o + $(CC) $(LTM_CFLAGS) tommath_amalgam.o etc/tune.o -o etc/tune + ./etc/tune_it.sh + rm -f *.a *.o + $(CC) $(LTM_CFLAGS) -fprofile-arcs -c pre_gen/tommath_amalgam.c -o tommath_amalgam.o + $(CC) $(LTM_CFLAGS) -fbranch-probabilities -c pre_gen/tommath_amalgam.c -o tommath_amalgam.o + $(AR) $(ARFLAGS) $(LIBNAME) tommath_amalgam.o + + install: $(LIBNAME) .install_common install -m 644 $(LIBNAME) $(DESTDIR)$(LIBPATH) install -m 644 $(HEADERS_PUB) $(DESTDIR)$(INCPATH) @@ -108,6 +153,10 @@ tune: $(LIBNAME) $(MAKE) -C etc tune CFLAGS="$(LTM_CFLAGS) -I../" $(MAKE) +graphs: $(LIBNAME) + $(MAKE) -C etc graphs CFLAGS="$(LTM_CFLAGS) -I../" + $(MAKE) + etc-all: $(LIBNAME) $(MAKE) -C etc all CFLAGS="$(LTM_CFLAGS) -I../" $(MAKE) @@ -127,6 +176,16 @@ cmp: profiled_single ./timing $(MAKE) -C logs/ cmp +cmp_tuned: profiled_single_tuned + $(CC) $(LTM_CFLAGS) -DMP_VERSION=\"after\" demo/timing.c $(LIBNAME) -lgcov -o timing + ./timing + $(MAKE) -C logs/ cmp + +cmp_tuned_tuned: profiled_single_tuned_tuned + $(CC) $(LTM_CFLAGS) -DMP_VERSION=\"after\" demo/timing.c $(LIBNAME) -lgcov -o timing + ./timing + $(MAKE) -C logs/ cmp + zipup: $(MAKE) clean $(MAKE) .zipup diff --git a/makefile.mingw b/makefile.mingw index e2445e8a0..80e66652c 100644 --- a/makefile.mingw +++ b/makefile.mingw @@ -86,6 +86,11 @@ tune: $(LIBNAME_S) $(MAKE) -C etc tune $(MAKE) +graphs: $(LIBNAME) + $(MAKE) -C etc graphs + $(MAKE) + + clean: @-cmd /c del /Q /S *.o *.a *.exe *.dll 2>nul diff --git a/makefile.msvc b/makefile.msvc index 8feb425c4..4fa88517e 100644 --- a/makefile.msvc +++ b/makefile.msvc @@ -86,6 +86,10 @@ tune: $(LIBMAIN_S) $(MAKE) -C etc tune $(MAKE) +graphs: $(LIBNAME) + $(MAKE) -C etc graphs + $(MAKE) + clean-obj: @-cmd /c del /Q /S *.OBJ 2>nul diff --git a/makefile.shared 
b/makefile.shared index 50c335269..7bf0e1aae 100644 --- a/makefile.shared +++ b/makefile.shared @@ -92,8 +92,19 @@ $(foreach demo, $(strip $(DEMOS)), $(eval $(call DEMO_template,$(demo)))) mtest: cd mtest ; $(CC) $(LTM_CFLAGS) -O0 mtest.c $(LTM_LDFLAGS) -o mtest -tune: $(LIBNAME) - $(LTCOMPILE) $(LTM_CFLAGS) -c etc/tune.c -o etc/tune.o - $(LTLINK) $(LTM_LDFLAGS) -o etc/tune etc/tune.o $(LIBNAME) - cd etc/; /bin/sh tune_it.sh; cd .. +# etc/tune.c uses hidden symbols, so we assume that only difference between the shared and static +# libraries is just that: they are shared or static and simply build a static library to run +# etc/tune +tune: + $(MAKE) -f makefile tune + $(MAKE) -f makefile clean $(MAKE) -f makefile.shared + +graphs: + $(MAKE) -f makefile graphs + $(MAKE) -f makefile clean + $(MAKE) -f makefile.shared + + + + diff --git a/makefile.unix b/makefile.unix index 58642098d..653b6a5ec 100644 --- a/makefile.unix +++ b/makefile.unix @@ -85,6 +85,10 @@ tune: $(LIBMAIN_S) $(MAKE) -C etc tune $(MAKE) +graphs: $(LIBNAME) + $(MAKE) -C etc graphs + $(MAKE) + #NOTE: this makefile works also on cygwin, thus we need to delete *.exe clean: -@rm -f $(OBJECTS) $(LIBMAIN_S) diff --git a/makefile_include.mk b/makefile_include.mk index d47ea2ba2..4aadc770f 100644 --- a/makefile_include.mk +++ b/makefile_include.mk @@ -168,7 +168,6 @@ check: test ./test #make the code coverage of the library -# coverage: LTM_CFLAGS += -fprofile-arcs -ftest-coverage -DTIMING_NO_LOGS coverage: LTM_LFLAGS += -lgcov coverage: LTM_LDFLAGS += -lgcov @@ -192,8 +191,9 @@ cleancov: cleancov-clean clean clean: rm -f *.gcda *.gcno *.gcov *.bat *.o *.a *.obj *.lib *.exe *.dll etclib/*.o \ demo/*.o test timing mtest_opponent mtest/mtest mtest/mtest.exe tuning_list \ - *.s tommath_amalgam.c pre_gen/tommath_amalgam.c *.da *.dyn *.dpi tommath.tex \ + *.s tommath_amalgam.c *.da *.dyn *.dpi tommath.tex \ + cmake_install.cmake Makefile \ `find . 
-type f | grep [~] | xargs` *.lo *.la - rm -rf .libs/ demo/.libs + rm -rf .libs/ demo/.libs CMakeFiles pre_gen ${MAKE} -C etc/ clean MAKE=${MAKE} ${MAKE} -C doc/ clean MAKE=${MAKE}