diff --git a/app/configs/config_abbreviations.json b/app/configs/config_abbreviations.json new file mode 100644 index 00000000..81f7e306 --- /dev/null +++ b/app/configs/config_abbreviations.json @@ -0,0 +1,25 @@ +{ + "common_abbr": [ + "СССР", "РФ", "США", "ВКР", "ИТ", "ПО", "ООО", "ЗАО", "ОАО", "HTML", "CSS", + "JS", "ЛЭТИ", "МОЕВМ", "ЭВМ", "ГОСТ", "DVD", "ИИ", "ОБЗОР", + "ООП", "ЛР", "КР", "ОТЧЕТ", "ПЛАН", "СЛОВА", "ЦПУ", "МБ", "ОЗУ", "КБ", + "SSD", "PC", "HDD", + "AX", "BX", "CX", "DX", "SI", "DI", "BP", "SP", + "AH", "AL", "BH", "BL", "CH", "CL", "DH", "DL", + "CS", "DS", "ES", "SS", "FS", "GS", + "IP", "EIP", "RIP", "URL", + "CF", "PF", "AF", "ZF", "SF", "TF", "IF", "DF", "OF", + "EAX", "EBX", "ECX", "EDX", "ESI", "EDI", "EBP", "ESP", + "RAX", "RBX", "RCX", "RDX", "RSI", "RDI", "RBP", "RSP", + "DOS", "OS", "BIOS", "UEFI", "MBR", "GPT", + "ASCII", "UTF", "UNICODE", "ANSI", + "ЭВМ", "МОЭВМ", + "CPU", "GPU", "APU", "RAM", "ROM", "PROM", "EPROM", "EEPROM", + "USB", "SATA", "PCI", "PCIe", "AGP", "ISA", "VGA", "HDMI", "DP", + "LAN", "WAN", "WLAN", "VPN", "ISP", "DNS", "DHCP", "TCP", "UDP", "IP", + "HTTP", "HTTPS", "FTP", "SSH", "SSL", "TLS", "XP", "ELF", "ACM", "IEEE", "UX", + "API", "GUI", "CLI", "IDE", "SDK", "SQL", "NoSQL", "XML", "JSON", "YAML", + "MAC", "IBM", "CERF", "LTR", "RTL", "FPS", "SHA", "AR", "EN", "RU", + "CREAT", "FIFO", "RSS", "UML", "UI", "GB", "IJGBL" + ] +} \ No newline at end of file diff --git a/app/main/check_packs/pack_config.py b/app/main/check_packs/pack_config.py index d31bc3c1..696b9188 100644 --- a/app/main/check_packs/pack_config.py +++ b/app/main/check_packs/pack_config.py @@ -22,6 +22,7 @@ ['pres_image_capture'], ['task_tracker'], ['overview_in_tasks'], + ['pres_abbreviations_check'], ['pres_aspect_ratio_check'], ['pres_was_were_check'], ] @@ -53,6 +54,7 @@ ["empty_task_page_check"], ["water_in_the_text_check"], ["report_task_tracker"], + ["report_abbreviations_check"], ["report_was_were_check"], ] diff --git 
# ===== app/main/checks/check_abbreviations.py =====
import json
import re
from pathlib import Path

from pymorphy3 import MorphAnalyzer

# Morphological analyzer shared by every call of find_abbreviations().
morph = MorphAnalyzer()

# Switch to True to print a trace of the explanation search to stdout.
DEBUG_MODE = False

# An abbreviation candidate is a standalone token of 2-5 upper-case Cyrillic or Latin letters.
_ABBR_TOKEN_RE = re.compile(r"\b[А-ЯЁA-Z]{2,5}\b")

# Ways of comparing the text captured next to an abbreviation with the abbreviation itself.
_WHOLE = "whole"  # every captured word must contribute exactly one initial
_LEADING = "leading"  # only the first len(abbr) captured words are compared
_TRAILING = "trailing"  # only the last len(abbr) captured words are compared

# Regex templates; "{abbr}" is replaced with the escaped abbreviation and group 1 is the explanation.
_EXPLANATION_TEMPLATES = (
    (r"\b{abbr}\b\s*\(([^)]+)\)", _WHOLE),  # ABBR (explanation)
    (r"\(([^)]+)\)\s*\b{abbr}\b", _WHOLE),  # (explanation) ABBR
    (r"\b{abbr}\b\s*[—–\-]\s*([^.,;!?]+)", _LEADING),  # ABBR — explanation
    (r"([^.,;!?]+)\s*[—–\-]\s*\b{abbr}\b", _TRAILING),  # explanation — ABBR
    (r"([^.,;!?()]+)\s*\(\s*{abbr}\s*\)", _TRAILING),  # explanation (ABBR)
)


def load_abbreviations():
    """Read ``configs/config_abbreviations.json`` and return its ``common_abbr`` entries as a set.

    A missing or null ``common_abbr`` key yields an empty set instead of a ``TypeError``.
    """
    config_path = Path(__file__).parent.parent.parent / "configs" / "config_abbreviations.json"
    with open(config_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    return set(data.get("common_abbr") or [])


# Abbreviations that never have to be explained in a checked document.
COMMON_ABBR = load_abbreviations()


def debug_print(*args, **kwargs):
    """Forward the arguments to ``print`` only while ``DEBUG_MODE`` is enabled."""
    if DEBUG_MODE:
        print(*args, **kwargs)


def get_first_letters(phrase):
    """Return the upper-cased first letters of the whitespace-separated words of *phrase*."""
    if not phrase:
        return ""
    return "".join(word[0].upper() for word in phrase.split())


def _explanation_matches(abbr, explanation, mode):
    """Tell whether the initials of *explanation* spell *abbr* under the given comparison mode."""
    if mode == _WHOLE:
        return correctly_explained(abbr, explanation)
    words = explanation.split()
    # The capture runs up to the nearest punctuation mark, so it usually holds more than the
    # expansion itself; only the words adjacent to the abbreviation are compared.
    words = words[: len(abbr)] if mode == _LEADING else words[-len(abbr):]
    return get_first_letters(" ".join(words)) == abbr.upper()


def is_abbreviation_explained(abbr: str, text: str) -> bool:
    """Return True when *text* contains an expansion whose initials spell *abbr*.

    Every occurrence of every supported form is examined, so an unrelated earlier match
    (for example ``ABBR (see fig. 1)``) does not hide a valid explanation given later.
    The search is case-insensitive, as it was originally.
    """
    debug_print(f"Проверка аббревиатуры: {abbr}")
    debug_print(f"Текст (первые 200 символов): {text[:200]}...")

    if not abbr:
        return False

    escaped_abbr = re.escape(abbr)
    for template, mode in _EXPLANATION_TEMPLATES:
        pattern = template.replace("{abbr}", escaped_abbr)
        for match in re.finditer(pattern, text, re.IGNORECASE):
            explanation = match.group(1)
            debug_print(f"  Найден паттерн {pattern}")
            debug_print(f"  Расшифровка: {explanation}")

            if _explanation_matches(abbr, explanation, mode):
                debug_print("  Расшифровка корректна")
                return True

            debug_print("  Расшифровка НЕ соответствует первым буквам")
            debug_print(f"  Ожидалось: {abbr.upper()}")
            debug_print(f"  Получено: {get_first_letters(explanation)}")

    debug_print(f"  Расшифровка для {abbr} не найдена")
    return False


def get_unexplained_abbrev(text, unverifiable_text):
    """Return ``(found_any, unexplained)`` for the abbreviations used in *text*.

    ``found_any`` is False when no abbreviation candidates were found at all; ``unexplained``
    lists the candidates for which ``is_abbreviation_explained`` returned False.
    """
    abbreviations = find_abbreviations(text, unverifiable_text)
    if not abbreviations:
        return False, []
    return True, [abbr for abbr in abbreviations if not is_abbreviation_explained(abbr, text)]


def find_abbreviations(text: str, unverifiable_text: str):
    """Return the abbreviation candidates of *text* in order of first appearance.

    Tokens listed in ``COMMON_ABBR`` and tokens that occur as standalone words in
    *unverifiable_text* are skipped. The exemption is a whole-word comparison: a plain
    substring test would also exempt e.g. "ИС" because of the word "СПИСОК".
    """
    exempt = set(_ABBR_TOKEN_RE.findall(unverifiable_text or ""))
    seen = set()
    result = []
    for abbr in _ABBR_TOKEN_RE.findall(text):
        if abbr in seen:
            continue
        seen.add(abbr)
        if abbr in COMMON_ABBR or abbr in exempt:
            continue
        # NOTE(review): kept as in the original; presumably meant to drop ordinary words typed
        # in capitals, but it is unclear when pymorphy3 reports a zero score -- confirm.
        if morph.parse(abbr.lower())[0].score != 0:
            result.append(abbr)
    return result


def correctly_explained(abbr, explan):
    """Return True when the initials of all words of *explan* spell *abbr* (case-insensitive)."""
    return get_first_letters(explan) == abbr.upper()


def main_check(text: str, unverifiable_text: str):
    """Run the abbreviation check shared by the report and presentation criteria.

    Returns ``(continue_check, message, unexplained_abbr)``. ``continue_check`` is True only
    when unexplained abbreviations were found; otherwise ``message`` states why there is
    nothing to report and the list is empty. Exceptions are reported through ``message``
    rather than raised.
    """
    try:
        debug_print(f"unverifiable_text : {unverifiable_text}")
        # Each outcome returns immediately so that a later branch cannot overwrite its message.
        if not text:
            return False, "Не удалось получить текст", []

        abbr_is_finding, unexplained_abbr = get_unexplained_abbrev(text=text, unverifiable_text=unverifiable_text)
        if not abbr_is_finding:
            return False, "Аббревиатуры не найдены в представленном документе", []
        if not unexplained_abbr:
            return False, "Все аббревиатуры правильно расшифрованы", []

        return True, "", unexplained_abbr

    except Exception as e:
        return False, f"Ошибка при проверке аббревиатур: {str(e)}", []


def forming_response(unexplained_abbr_with_page, format_page_link):
    """Build the feedback message listing each unexplained abbreviation with its page link.

    *unexplained_abbr_with_page* maps an abbreviation to the page (slide) of its first use;
    *format_page_link* is called once with the list of pages and its result is indexed in
    the same order.
    """
    # NOTE(review): the line separators were rendered as raw line breaks in the reviewed diff;
    # "<br>" is assumed here -- confirm against the repository.
    page_links = format_page_link(list(unexplained_abbr_with_page.values()))
    lines = ["Найдены нерасшифрованные аббревиатуры при первом использовании:"]
    for index_links, abbr in enumerate(unexplained_abbr_with_page):
        lines.append(f"- {abbr} на {page_links[index_links]} странице")
    lines.append("Каждая аббревиатура должна быть расшифрована при первом использовании в тексте.")
    lines.append("Расшифровка должна быть по первым буквам, например, МВД - Министерство внутренних дел.")
    return "".join(f"{line}<br>" for line in lines)


# ===== app/main/checks/presentation_checks/abbreviations_presentation.py =====
import re

from ..base_check import BasePresCriterion, answer
from ..check_abbreviations import forming_response, main_check


class PresAbbreviationsCheck(BasePresCriterion):
    """Presentation criterion: abbreviations must be explained in the slide text."""

    _description = "Аббревиатуры в тексте должны быть расшифрованы при первом использовании."
    id = "pres_abbreviations_check"
    warning = True

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the shared abbreviation check and report the first slide of each unexplained one."""
        try:
            slides_text = self.file.get_text_from_slides()
            # The first slide is exempt from the check; an empty presentation has no such slide.
            title_page = slides_text[0] if slides_text else ""
            full_text = " ".join(slides_text)

            continue_check, res_str, unexplained_abbr = main_check(text=full_text, unverifiable_text=title_page)
            if not continue_check:
                return answer(True, res_str)

            unexplained_abbr_with_slides = {}

            # NOTE(review): slide numbers are zero-based here, while the report check passes
            # one-based page numbers to format_page_link -- confirm which one it expects.
            for slide_num, slide_text in enumerate(slides_text, 0):
                for abbr in unexplained_abbr:
                    if abbr in unexplained_abbr_with_slides:
                        continue
                    # Whole-word search: a substring test would locate "ИС" inside "СПИСОК".
                    if re.search(rf"\b{re.escape(abbr)}\b", slide_text):
                        unexplained_abbr_with_slides[abbr] = slide_num

            if not unexplained_abbr_with_slides:
                return answer(True, "Все аббревиатуры правильно расшифрованы")

            result_str = forming_response(unexplained_abbr_with_slides, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")
import re

from ..base_check import BaseReportCriterion, answer
from ..check_abbreviations import forming_response, main_check


class ReportAbbreviationsCheck(BaseReportCriterion):
    """Report criterion: abbreviations must be explained in the document text."""

    label = "Проверка расшифровки аббревиатур"
    _description = "Аббревиатуры в тексте должны быть расшифрованы при первом использовании."
    id = "report_abbreviations_check"
    warning = True

    # Headings of the sections whose abbreviations are exempt from the check.
    _UNVERIFIABLE_HEADINGS = (
        "СПИСОК ИСПОЛЬЗОВАННЫХ ИСТОЧНИКОВ",
        "ПРИЛОЖЕНИЕ",
        "ОПРЕДЕЛЕНИЯ, ОБОЗНАЧЕНИЯ И СОКРАЩЕНИЯ",
    )

    def __init__(self, file_info):
        super().__init__(file_info)

    def check(self):
        """Run the shared abbreviation check and report the first page of each unexplained one."""
        try:
            text = self._get_document_text()
            unverifiable_text = self._get_unverifiable_text(list(self._UNVERIFIABLE_HEADINGS))

            continue_check, res_str, unexplained_abbr = main_check(text=text, unverifiable_text=unverifiable_text)
            if not continue_check:
                return answer(True, res_str)

            unexplained_abbr_with_page = {}

            for page_num in range(1, self.file.page_counter() + 1):
                # Stop scanning as soon as every abbreviation has been located.
                if len(unexplained_abbr_with_page) == len(unexplained_abbr):
                    break
                text_on_page = self.file.pdf_file.text_on_page[page_num]

                for abbr in unexplained_abbr:
                    if abbr in unexplained_abbr_with_page:
                        continue
                    # Whole-word search: a substring test would locate "ИС" inside "СПИСОК".
                    if re.search(rf"\b{re.escape(abbr)}\b", text_on_page):
                        unexplained_abbr_with_page[abbr] = page_num

            if not unexplained_abbr_with_page:
                return answer(True, "Все аббревиатуры правильно расшифрованы")
            result_str = forming_response(unexplained_abbr_with_page, self.format_page_link)
            return answer(False, result_str)

        except Exception as e:
            return answer(False, f"Ошибка при проверке аббревиатур: {str(e)}")

    def _get_document_text(self):
        """Return the document text as one string, or None when the file exposes no text source."""
        if hasattr(self.file, "pdf_file"):
            page_texts = self.file.pdf_file.get_text_on_page()
            return " ".join(page_texts.values())
        elif hasattr(self.file, "paragraphs"):
            text_parts = []
            for paragraph in self.file.paragraphs:
                text = paragraph.to_string()
                # Only the second line of a multi-line to_string() result is kept.
                # NOTE(review): presumably the first line carries paragraph metadata -- confirm.
                if "\n" in text:
                    text = text.split("\n")[1]
                text_parts.append(text)
            return "\n".join(text_parts)
        return None

    def _collect_child_text(self, child_elements, text_parts):
        """Append the text of *child_elements* and of all their descendants to *text_parts*."""
        for child in child_elements:
            if child.get("text"):
                text_parts.append(child["text"])
            if child.get("child"):
                self._collect_child_text(child["child"], text_parts)

    def _get_text_into_sections(self, headings):
        """Return the joined text of every chapter whose title contains one of *headings*.

        The comparison is case-insensitive; the chapter title and the text of all nested
        children are included.
        """
        chapters = self.file.make_chapters(self.file_type["report_type"])
        text_parts = []

        for chapter in chapters:
            chapter_title = chapter.get("text", "").upper()

            # NOTE(review): indentation was lost in the reviewed diff; child text is collected
            # only for matching chapters here -- confirm this is the intended nesting.
            if any(stop.upper() in chapter_title for stop in headings):
                text_parts.append(chapter["text"])
                if chapter.get("child"):
                    self._collect_child_text(chapter["child"], text_parts)

        return " ".join(text_parts)

    def _get_text_title_page(self):
        """Return the text of page 1 of the PDF rendition."""
        title_page = self.file.pdf_file.text_on_page[1]
        return title_page

    def _get_unverifiable_text(self, unverifiable_headings):
        """Return the title-page text followed by the text of the exempt sections."""
        unverifiable_text = self._get_text_title_page() + self._get_text_into_sections(unverifiable_headings)
        return unverifiable_text
[А-Я][а-я]+)" + ], # "logs": "", }, { diff --git a/app/utils/converter.py b/app/utils/converter.py index 821cc976..2c051eba 100644 --- a/app/utils/converter.py +++ b/app/utils/converter.py @@ -9,11 +9,14 @@ def run_process(cmd: str): def convert_to(filepath, target_format='pdf'): new_filename, outdir = None, dirname(filepath) - convert_cmd = "timeout 3m " + { - 'pdf': f"soffice --headless --convert-to pdf --outdir {outdir} {filepath}", - 'docx': f"soffice --headless --convert-to docx --outdir {outdir} {filepath}", - 'pptx': f"soffice --headless --convert-to pptx --outdir {outdir} {filepath}", - }[target_format] + convert_cmd = ( + "timeout 3m " + + { + 'pdf': f"soffice --headless --convert-to pdf --outdir {outdir} {filepath}", + 'docx': f"soffice --headless --convert-to docx --outdir {outdir} {filepath}", + 'pptx': f"soffice --headless --convert-to pptx --outdir {outdir} {filepath}", + }[target_format] + ) if run_process(convert_cmd).returncode == 0: # success conversion