Feature: CLI supports -c (clean) mode

hiddenblue · hiddenblue · commit 8be9b24e3cde · 2024-12-18T01:04:36.000+08:00
diff --git a/.gitignore b/.gitignore
@@ -7,7 +7,7 @@ __pycache__/
 document/
 *.xls
 test.py
-pubmedsql
+pubmedsql.db
 
 
 # C extensions
diff --git a/README.adoc b/README.adoc
@@ -90,6 +90,7 @@ optional arguments:
   --output      -o    add --output filename to appoint name of pdf file
   --loglevel    -l    set the console log level, e.g debug
   --yes         -Y    add --yes or -Y to skip the confirmation process before search
+  --clean       -c    clean the output directory and the SQLite history table
 ----
 
 *If you are familiar with IDEs, you can run `main.py` in Python environments such as `pycharm` or `vscode`.*
diff --git a/README_CN.adoc b/README_CN.adoc
@@ -97,6 +97,7 @@ optional arguments:
 
 --loglevel    -l    set the console log level, e.g -l debug
 --yes         -Y    add --yes to skip the confirmation process before searching
+--clean       -c    clean the output directory and the SQLite history table
 ----
 
 _如果你熟悉IDE的话，可以在pycharm或者vscode等python环境下运行main.py_
diff --git a/config.py b/config.py
@@ -9,8 +9,9 @@ def __init__(self):
         self.savetime: str = time.strftime("%Y%m%d%H%M%S")
         self.feedbacktime: float = 1.5
         self.pdfSavePath: str = "./document/pub"
+        self.dbpath: str = "./pubmedsql.db"
 
-        # 这个参数用于geteachinfo决定一次性通过异步下载多少页面的信息，默认50啦
+        # 这个参数用于GetEachInfo决定一次性通过异步下载多少页面的信息，默认50啦
         self.InfoBatchSize: int = 50
         self.PDF_BatchSize: int = 5
 
diff --git a/main.py b/main.py
@@ -5,15 +5,17 @@
 from time import sleep
 
 from GetEachInfo import geteachinfo
-from GetSearchResult import spiderpub
+from GetSearchResult import searchEntry
 from config import ProjectInfo, projConfig
 from utils.Commandline import MedCli
 from utils.ExcelHelper import ExcelHelper
 from utils.LogHelper import medLog, MedLogger
 from utils.PDFHelper import PDFHelper
 from utils.WebHelper import WebHelper
 
+# 从config.py当中导入一些配置信息，后续可能会被cli的参数override
 feedbacktime = projConfig.feedbacktime
+dbpath = projConfig.dbpath
 
 
 def printSpliter(length=25):
@@ -81,6 +83,8 @@ def printSpliter(length=25):
                         help='add --output or -o to specify output path of pdf file '
                              ' For example, -o pmc7447651.pdf. Default is PMCxxxxxx.pdf',
                         default='None')
+    
+    # 调试 清理 相关的参数
 
     parser.add_argument("-l", "--loglevel", metavar='',
                         choices=('debug', 'info', 'warning', 'error', 'critical'),
@@ -93,30 +97,46 @@ def printSpliter(length=25):
                         help='add --yes or -Y to skip the confirmation process and start searching directly',
                         default=False)
     
+    parser.add_argument("-c", "--clean", action="store_true",
+                        help='clean the output directory and sqlite history table',
+                        default=False)
+    
     # todo 
     # add mutual exclusive group for some args
 
     ####################################################################################################
 
     args = parser.parse_args()
 
+    # 打印项目信息
     # print the hello info
     ProjectInfo.printProjectInfo()
     print("\n")
 
+    # 设置日志级别
     # alter the log level according to the cli args
     # cli first, overriding the config.py
     if args.loglevel is not None:
         loglevel = MedCli.parseLogLevel(args.loglevel)
         projConfig.loglevel = loglevel
         MedLogger.setTerminalLogLevel(medLog, loglevel)
+        
+    # 清理历史记录
+    if args.clean is True:
+        # 默认的excel和txt输出目录应该是在当前文件夹
+        MedCli.cleanHistory(directory="./", dbpath=dbpath, skip=args.yes)
+        medLog.info("Exiting")
+        sleep(feedbacktime)
+        sys.exit()
 
+    # 单篇处理模式
     if args.keyword is None and (args.pmcid, args.pmid):
         # 关键词为空，进入单篇处理模式
         MedCli.SingleArticleMode(pmcid=args.pmcid, pmid=args.pmid)
     else:
         pass
 
+    # 设置保存目录
     # check the directory variable. the path variable from cli is preferred.
     # the default pdf saving directory path is from config.py which is './document/pub'
     if args.directory is not None:
@@ -127,44 +147,48 @@ def printSpliter(length=25):
             medLog.error("Please check your config.py and cli parameter." "The program will exit.")
             sys.exit()
 
+    # 检查关键词
     if args.keyword.isspace() or args.keyword.isnumeric():
         medLog.error("pubmedsoso search keyword error\n")
         medLog.error("the program will exit.")
         sleep(feedbacktime)
 
     ######################################################################################################
 
+    # 输出当前参数
     medLog.info(f"Current commandline parameters: {args.__dict__}\n")
     medLog.info(
         f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.pagenum}, 年份：{args.year}, 文献下载数量: {args.downloadnum}, 下载文献的存储目录: {projConfig.pdfSavePath}\n")
+    
+    # 获取搜索结果数量
     try:
         result_num = WebHelper.GetSearchResultNum(keyword=args.keyword, year=args.year)
     except Exception as err:
         raise
 
     medLog.info("当前关键词在pubmed检索到的相关结果数量为: %s\n" % result_num)
-    
+
+    # 确认开始执行程序
     # add --yes parameter to skip the confirmation
-    if args.Yes is True:
+    if args.yes is True:
         pass
     else:
-
         medLog.info("是否要根据以上参数开始执行程序？y or n\n")
         startFlag = input()
-        if startFlag == 'y' or startFlag == 'Y' or startFlag == 'Yes':
-            pass
-        if startFlag in ["n", "N", "No", "no"]:
+            
+        if startFlag not in ['y', 'Y', 'Yes']:
             medLog.critical("程序终止执行\n\n")
             sleep(feedbacktime * 0.5)
             sys.exit()
-
+        
     ######################################################################################################
 
     printSpliter()
     medLog.info("程序已运行，开始检查数据储存目录\n")
     printSpliter()
     sleep(0.5)
 
+    # 检查保存目录
     if os.path.exists(projConfig.pdfSavePath):
         medLog.info("文件储存目录检查正常，可以储存文件\n")
     else:
@@ -176,30 +200,33 @@ def printSpliter(length=25):
 
     sleep(feedbacktime)
 
-    dbpath = "./pubmedsql"
     # ?term=cell%2Bblood&filter=datesearch.y_1&size=20
 
     # 根据上面输入的关键词初始化生成url参数
+    # 解析URL参数
     ParamDict = WebHelper.parseParamDcit(keyword=args.keyword, year=args.year)
     encoded_param = WebHelper.encodeParam(ParamDict)
 
-    # 从此处开始爬取数据
 
     printSpliter()
-
-    spiderpub(encoded_param, args.pagenum, result_num)
+    
+    # 爬取搜索结果
+    searchEntry(encoded_param, args.pagenum, result_num)
 
     printSpliter()
     medLog.info("\n\n爬取搜索结果完成，开始执行单篇检索，耗时更久\n\n")
 
+    # 获取每篇文章信息
     geteachinfo(dbpath)
-
+    
     printSpliter()
     medLog.info("\n\n爬取搜索结果完成，开始执行文献下载，耗时更久\n\n")
 
+    # 下载PDF
     # PDFHelper.PDFBatchDonwload(args.download_num)
     PDFHelper.PDFBatchDownloadEntry(args.downloadnum)
-
+    
+    # 生成Excel表格
     ExcelHelper.PD_To_excel(dbpath, override=True)
     medLog.info("爬取最终结果信息已经自动保存到excel表格中，文件名为%s" % ExcelHelper.tablename)
     medLog.info(f"爬取的所有文献已经保存到{projConfig.pdfSavePath}目录下")
diff --git a/utils/Commandline.py b/utils/Commandline.py
@@ -1,8 +1,13 @@
 import logging
 import sys
+from time import sleep
 
+from utils.Clean import clean_files, clean_sqlite
+from config import projConfig
 from utils.LogHelper import medLog
 
+feedbacktime = projConfig.feedbacktime
+
 
 class MedCli:
 
@@ -39,3 +44,31 @@ def SingleArticleMode(**kwargs):
 
         medLog.warning("The program is exiting.\n")
         sys.exit(0)
+
+    @staticmethod
+    def cleanHistory(directory: str, dbpath: str, **kwargs):
+        """Log the targets, ask for confirmation unless kwargs['skip'] is True, then call clean_files(directory) and clean_sqlite(dbpath); calls sys.exit() if the answer is not y/Y/Yes."""
+        medLog.warning("The clean.py is up")
+        medLog.info("The target directory  is \"%s\"" % directory)
+        medLog.info("The target database path is \"%s\"" % dbpath)
+        sleep(feedbacktime)
+
+        if kwargs.get('skip', None) is not None and kwargs.get('skip') is True:
+            # skip the confirmation prompt when skip=True (the -Y/--yes flag)
+            pass
+        else:
+            medLog.info("是否要根据以上参数执行清理程序？y or n\n")
+            startFlag = input()
+
+            if startFlag not in ['y', 'Y', 'Yes']:
+                medLog.critical("程序终止执行\n\n")
+                sleep(feedbacktime * 0.5)
+                sys.exit()
+
+        # clean up files under the target directory
+        clean_files(directory)
+        # clean up the old tables in the database
+        clean_sqlite(dbpath)
+        # run the main command (currently disabled)
+        # run_main_command()
+        medLog.warning("The clean.py is down")
diff --git a/utils/ExcelHelper.py b/utils/ExcelHelper.py
@@ -14,6 +14,7 @@
 class ExcelHelper:
     savepath: str = f'./pubmed-{projConfig.savetime}.xlsx'
     tablename: str = f'pubmed{projConfig.savetime}'
+    dbpath = projConfig.dbpath
     # 原始列名和新列名组成的字典
     rename_dict = {
         'id': '序号',
@@ -106,13 +107,13 @@ def PD_To_excel(cls, dbpath: str, override=False) -> None:
         except Exception as e:
             medLog.error(f"\n爬取数据库信息保存到Excel失败: {e}\n")
 
-    @staticmethod
-    def local_export():
+    @classmethod
+    def local_export(cls):
         # todo
         # 将excel导出功能整合到cli当中
         import DBHelper
 
-        dbpath: str = 'pubmedsql'
+        dbpath: str = cls.dbpath
         table_list: list = DBHelper.DBTableFinder(dbpath)
         if not table_list:
             medLog.critical("目标数据库不存在或者内容为空，请检查数据库，即将退出")
diff --git a/utils/PDFHelper.py b/utils/PDFHelper.py
@@ -22,6 +22,13 @@
 # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9034016/pdf/main.pdf
 
 class PDFHelper:
+    """
+    处理PDF下载 保存相关逻辑的代码
+    主要都是classmethod或者staticmethod
+    
+    """
+    
+    # 类变量
     baseurl = "http://www.ncbi.nlm.nih.gov/"
     # 没有采用https是因为听说https的审查会增加延时
     headers = {
@@ -38,9 +45,8 @@ class PDFHelper:
         'sec-fetch-user': '?1',
         'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
     }
-
-    def __init__(self, dbpath):
-        self.dbpath = dbpath
+    dbpath = projConfig.dbpath
+    tablename = 'pubmed%s' % projConfig.savetime
 
     @staticmethod
     def handle_error(e):
@@ -69,8 +75,8 @@ def PDFBatchDownloadEntry(cls, limit):
         异步批量处理的pdf下载函数
         感觉写得稀烂啊
         """
-        tablename = 'pubmed%s' % projConfig.savetime
-        dbpath = 'pubmedsql'
+        tablename = cls.tablename
+        dbpath = cls.dbpath
         # 注意这个列表的数据类型，和名称并不是相符的
         # 这个返回的结果是有免费全文的，包括 FreeArticle 和 FreePMCArticle 两类
         free_article_list: [TempPMID] = DBFetchAllFreePMC(dbpath, tablename)
@@ -196,10 +202,10 @@ def FileSave(cls, content: bytes, savepath: str) -> bool:
             medLog.info("open success")
             file.write(content)
             file.close()
-            medLog.info("文件写入成功", "保存路径为%s" % savepath)
+            medLog.info("文件写入成功, 保存路径为%s" % savepath)
             return True
         except Exception as e:
-            medLog.error("文件写入失败", "保存路径为%s" % savepath)
+            medLog.error("文件写入失败, 保存路径为%s" % savepath)
             medLog.error(e)
             return False