@@ -148,6 +148,14 @@ def select_old():
148148
149149 return '' # Return an empty string because you have to return something
150150
@app.route("/removeUploadLabels", methods=["GET", "POST"])  # Tells Flask to handle the ajax request sent from '/scrub'
def removeUploadLabels():
    """
    Removes a Scrub upload file from the session when its label is clicked.

    The scrubbing option to clear is named by the "option" request header; its
    entry in session['scrubbingoptions']['optuploadnames'] is reset to an
    empty string.

    Returns:
        The string "success".
    """
    option = request.headers["option"]
    session['scrubbingoptions']['optuploadnames'][option] = ''

    # The assignment above changes a dict nested inside the session rather than
    # the session object itself, so the session has to be flagged as modified
    # explicitly for the change to be written back.
    session.modified = True

    return "success"
151159
152160@app .route ("/scrub" , methods = ["GET" , "POST" ]) # Tells Flask to load this function when someone is at '/scrub'
153161def scrub ():
@@ -946,6 +954,169 @@ def manage():
946954 managers .utility .saveFileManager (fileManager )
947955 return '' # Return an empty string because you have to return something
948956
def _gutenbergShortTitle(title):
    """
    Builds the lower-case, underscore-separated title slug used in file names.

    Colons and commas become underscores, spaces are dropped, and an underscore
    is inserted at every word boundary marked by a capital letter.
    """
    import re

    slug = title.replace(":", "_").replace(",", "_").replace(" ", "")
    slug = re.sub(r'(.)([A-Z][a-z]+)', r'\1_\2', slug)
    slug = re.sub(r'([a-z0-9])([A-Z])', r'\1_\2', slug).lower()
    return slug.replace("__", "_")


def _gutenbergFileName(author, title):
    """
    Composes '<author last name>_<short title>.xml' for a converted etext.

    The last space-separated word of the author is used as the last name, the
    short title is capped at 150 characters, and characters that are illegal
    or unsafe in file names are removed.
    """
    lastName = author.split(" ")[-1].lower()
    name = lastName + "_" + _gutenbergShortTitle(title)[:150] + ".xml"
    for illegal in ("&", "/", "\\", "\"", ":", ",", " "):
        name = name.replace(illegal, "")
    return name


def _gutenbergToTei(rawLines):
    """
    Converts the lines of a raw Project Gutenberg etext into a TEI document.

    Before the "*** START" marker only the "Author: " and "Title: " metadata
    lines are read (lines directly following the title, up to the next blank
    line, are treated as subtitle). Between the START and END markers, runs of
    non-blank lines are joined into paragraphs; paragraphs that begin with
    Project Gutenberg boilerplate are dropped.

    Args:
        rawLines: iterable of text lines as read from the etext.

    Returns:
        A tuple (author, title, text): the author line, the title including
        any subtitle, and the TEI document as a single whitespace-normalised
        string. The TEI header carries the main title without the subtitle.
    """
    import re
    from xml.sax.saxutils import escape

    fluff = ("Produced by", "End of the Project Gutenberg", "End of Project Gutenberg")
    teiOpen = "<?xml version=\"1.0\" encoding=\"utf-8\"?><TEI xmlns=\"http://www.tei-c.org/ns/1.0\" version=\"5.0\"><teiHeader><fileDesc><titleStmt>"
    teiMiddle = "</titleStmt><publicationStmt><publisher></publisher><pubPlace></pubPlace><availability status=\"free\"><p>Project Gutenberg</p></availability></publicationStmt><seriesStmt><title>Project Gutenberg Full-Text Database</title></seriesStmt><sourceDesc default=\"false\"><biblFull default=\"false\"><titleStmt>"
    teiClose = "</titleStmt><extent></extent><publicationStmt><publisher></publisher><pubPlace></pubPlace><date></date></publicationStmt></biblFull></sourceDesc></fileDesc><encodingDesc><editorialDecl default=\"false\"><p>Preliminaries omitted.</p></editorialDecl></encodingDesc></teiHeader><text><body><div>"

    # Initialised up front so a text without metadata lines cannot hit an
    # unbound name, and so nothing leaks from a previously converted text.
    author = ""
    headerTitle = ""   # main title only; used in the TEI header
    title = ""         # main title plus subtitle lines; used for the file name
    lookForSubtitle = False
    collecting = False
    paragraphs = []
    current = []       # lines of the paragraph being accumulated

    def flushParagraph():
        # Emit the accumulated paragraph unless it is empty or boilerplate.
        text = " ".join(current).strip()
        del current[:]
        if text and not text.startswith(fluff):
            paragraphs.append("<p>" + escape(text) + "</p>")

    for rawLine in rawLines:
        line = rawLine.strip()

        if not collecting:
            # Metadata is only meaningful in the preamble; once the body has
            # started, such lines are ordinary text.
            if line.startswith("Author: "):
                author = line[8:]
            elif line.startswith("Title: "):
                headerTitle = title = line[7:]
                lookForSubtitle = True
            elif ("*** START" in line) or ("***START" in line):
                collecting = True
                lookForSubtitle = False
            elif lookForSubtitle:
                if line:
                    title += ", " + line.strip(".")
                else:
                    lookForSubtitle = False
            continue

        if ("*** END" in line) or ("***END" in line):
            break
        if line:
            current.append(line)
        else:
            flushParagraph()

    # Keep the last paragraph even when no blank line precedes the END marker.
    flushParagraph()

    metadata = "<title>" + escape(headerTitle) + "</title><author>" + escape(author) + "</author>"
    parts = [teiOpen + metadata + teiMiddle + metadata + teiClose]
    parts.extend(paragraphs)
    parts.append("</div></body></text></TEI>")

    # Collapse every run of whitespace so the document is a single line.
    text = re.sub(r"\s+", " ", " ".join(parts))
    return author, title, text


@app.route("/gutenberg", methods=["GET", "POST"])  # Tells Flask to load this function when someone is at '/gutenberg'
def gutenberg():
    """
    Downloads Project Gutenberg etexts and saves them to the file manager as TEI.

    On a POST request the "urls" form field is read as one URL per line. Each
    http/https URL is fetched, converted to TEI and added to the file manager
    under a name built from the author's last name and the title; the file
    manager is then saved. Any other request just shows the page.

    Returns:
        The rendered 'gutenberg.html' template. Its `message` is a prompt for
        non-POST requests, or an HTML ordered list of the saved file names
        (and of any skipped URLs) after a POST.
    """
    import contextlib
    import urllib
    from xml.sax.saxutils import escape

    fileManager = managers.utility.loadFileManager()

    if request.method != "POST":
        # Covers the initial GET and the HEAD requests Flask accepts alongside
        # GET, so the view always returns a response.
        return render_template('gutenberg.html', message="Submit to load file")

    # One URL per line; tolerate "\r\n" line endings and surrounding blanks.
    urls = [entry.strip() for entry in request.form["urls"].splitlines() if entry.strip()]

    items = []
    for url in urls:
        # The URLs come straight from the form. Only fetch over http(s) so the
        # request cannot be pointed at local files through other schemes.
        if not url.lower().startswith(("http://", "https://")):
            items.append("<li>Skipped (not an http/https URL): " + escape(url) + "</li>")
            continue

        # closing() releases the connection even if reading fails.
        with contextlib.closing(urllib.urlopen(url)) as response:
            data = response.readlines()

        author, title, text = _gutenbergToTei(data)
        ofn = _gutenbergFileName(author, title)

        # Save the file to the file manager
        fileManager.addUploadFile(text, ofn)
        # The name is derived from downloaded text, so escape it for the page.
        items.append("<li>" + escape(ofn) + "</li>")

    message = "<ol>" + "".join(items) + "</ol>"

    # Save the file manager
    managers.utility.saveFileManager(fileManager)

    return render_template('gutenberg.html', message=message)
1119+
9491120# ======= End of temporary development functions ======= #
9501121
9511122install_secret_key ()
0 commit comments