Merge pull request #8 from hiddenblue/dev

hiddenblue · web-flow · commit cc1b45134741 · 2024-12-12T18:01:51.000+08:00
origin/dev
diff --git a/DBHelper.py b/DBHelper.py
@@ -4,7 +4,7 @@
 from DataType import Publication
 from DataType import SingleDocInfo, TempPMID
 from LogHelper import print_error
-from config import savetime
+from config import projConfig 
 
 
 # 把一些关于sqlite3相关的操作抽象出来了，方便其他模块调用
@@ -173,7 +173,7 @@ def DBWriter(dbpath: str, sql: str, params: tuple = None) -> bool:
 
 # 这个函数是用来保存文献打开页面获取到的单独的信息的
 def DBSaveInfo(singleinfo: SingleDocInfo, dbpath: str):
-    tablename = 'pubmed%s' % savetime
+    tablename = 'pubmed%s' % projConfig.savetime
 
     ret = False
     try:
diff --git a/ExcelHelper.py b/ExcelHelper.py
@@ -6,12 +6,13 @@
 import pandas as pd
 
 from LogHelper import print_error
-from config import savetime, feedbacktime
+from config import projConfig
+feedbacktime = projConfig.feedbacktime
 
 
 class ExcelHelper:
-    savepath: str = f'./pubmed-{savetime}.xlsx'
-    tablename: str = f'pubmed{savetime}'
+    savepath: str = f'./pubmed-{projConfig.savetime}.xlsx'
+    tablename: str = f'pubmed{projConfig.savetime}'
     # 原始列名和新列名组成的字典
     rename_dict = {
         'id': '序号',
@@ -131,6 +132,7 @@ def PD_To_excel(cls, dbpath: str, override=False) -> None:
                 break
             if 1 <= x <= len(table_list):
                 index = table_list[x - 1]
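+                # TODO(review): 'savetime' is no longer imported in this module, so the assignment below appears to have no effect on projConfig.savetime or ExcelHelper.tablename - confirm and update the shared config instead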
                 savetime = index[6:]
                 ExcelHelper.PD_To_excel(dbpath)
                 print("此次保存执行完成，下一个循环")
diff --git a/GetSearchResult.py b/GetSearchResult.py
@@ -10,7 +10,7 @@
 from DataType import ArticleFreeType, SingleSearchData
 from LogHelper import print_error
 from WebHelper import WebHelper
-from config import savetime
+from config import projConfig
 
 
 def parseSearchHtml(html: str) -> Optional[List[SingleSearchData]]:
@@ -119,7 +119,7 @@ def SaveSearchData(datalist: List[SingleSearchData], dbpath: str) -> None:
         datalist (List[SingleSearchData]): List of parsed search data.
         dbpath (str): Path to the SQLite database.
     """
-    tablename = f'pubmed{savetime}'
+    tablename = f'pubmed{projConfig.savetime}'
     for singleSearchData in datalist:
         try:
             sql = f"""
@@ -182,8 +182,8 @@ def spiderpub(parameter: str, page_limit: int, resultNum: int) -> None:
                 datalist.extend(SingleSearchPageData)
 
     dbpath = 'pubmedsql'
-    tablename = f'pubmed{savetime}'
-    txtname = f"pubmed{savetime}.txt"
+    tablename = f'pubmed{projConfig.savetime}'
+    txtname = f"pubmed{projConfig.savetime}.txt"
 
     try:
         DBHelper.DBCreater(dbpath)
diff --git a/PDFHelper.py b/PDFHelper.py
@@ -11,7 +11,7 @@
 from DBHelper import DBWriter, DBFetchAllFreePMC
 from DataType import TempPMID
 from LogHelper import print_error
-from config import savetime, pdfSavePath
+from config import projConfig
 
 
 # 把一些关于PDF相关的操作抽象出来了，方便其他模块调用
@@ -45,29 +45,29 @@ def handle_error(e):
         print_error("Error occured: %s" % e)
 
     @staticmethod
-    def IsFileExist(path: str) -> bool:
+    def __IsFileExist(path: str) -> bool:
         return Path(path).exists()
 
     @classmethod
-    def IsPDFExist(cls, tempid) -> bool:
-        savepath = cls.GetPDFSavePath(tempid)
-        return PDFHelper.IsFileExist(savepath)
+    def __IsPDFExist(cls, tempid) -> bool:
+        savepath = cls.__GetPDFSavePath(tempid)
+        return PDFHelper.__IsFileExist(savepath)
 
     @classmethod
-    def GetPDFFileName(cls, tempid: TempPMID) -> str:
+    def __GetPDFFileName(cls, tempid: TempPMID) -> str:
         return re.sub(r'[< >/\\|:"*?]', ' ', tempid.doctitle)
 
     @classmethod
-    def GetPDFSavePath(cls, tempid: TempPMID) -> str:
-        return f"{pdfSavePath}/{cls.GetPDFFileName(tempid)}.pdf"
+    def __GetPDFSavePath(cls, tempid: TempPMID) -> str:
+        return f"{projConfig.pdfSavePath}/{cls.__GetPDFFileName(tempid)}.pdf"
 
     @classmethod
     def PDFBatchDownloadEntry(cls, limit):
         """
         异步批量处理的pdf下载函数
         感觉写得稀烂啊
         """
-        tablename = 'pubmed%s' % savetime
+        tablename = 'pubmed%s' % projConfig.savetime
         count = 0
         dbpath = 'pubmedsql'
         PMID_list: [TempPMID] = DBFetchAllFreePMC(dbpath, tablename)
@@ -77,10 +77,10 @@ def PDFBatchDownloadEntry(cls, limit):
         # 过滤掉已经存在于本地的文献
         PMCID_list = []
         for item in temp_list:
-            if cls.IsPDFExist(item):
+            if cls.__IsPDFExist(item):
                 # 存在于目录当中直接更新就行了
-                cls.PDFUpdateDB(item, cls.GetPDFSavePath(item), dbpath)
-                print(f"PDF: {cls.GetPDFFileName(item)} 在保存目录当中已存在，跳过下载")
+                cls.PDFUpdateDB(item, cls.__GetPDFSavePath(item), dbpath)
+                print(f"PDF: {cls.__GetPDFFileName(item)} 在保存目录当中已存在，跳过下载")
             else:
                 PMCID_list.append(item)
 
@@ -95,7 +95,7 @@ def PDFBatchDownloadEntry(cls, limit):
                 status = PDFHelper.PDFSaveFile(item[1], item[0])
                 if (status == True):
                     # 将pdf文件名称以及存储位置等相关信息添加到sqlite数据库当中
-                    PDFHelper.PDFUpdateDB(item[0], cls.GetPDFSavePath(item[0]), dbpath)
+                    PDFHelper.PDFUpdateDB(item[0], cls.__GetPDFSavePath(item[0]), dbpath)
                 else:
                     print_error("保存pdf文件发生错误，自动跳过该文献PMCID为 %s" % item[0].PMCID)
                     continue
@@ -153,26 +153,26 @@ def PDFSaveFile(cls, html, tempid: TempPMID) -> bool:
         将pdf保存到本地文件的功能
         暂时还不确定能否支持异步，就先用同步版本了
         """
-        tablename = 'pubmed%s' % savetime
+        tablename = 'pubmed%s' % projConfig.savetime
         # pdf = html.decode("utf-8")  # 使用Unicode8对二进制网页进行解码，直接写入文件就不需要解码了
 
         try:
-            articleName = cls.GetPDFFileName(tempid)
+            articleName = cls.__GetPDFFileName(tempid)
             # 需要注意的是文件命名中不能含有以上特殊符号，只能去除掉
-            savepath = "%s/%s.pdf" % (pdfSavePath, articleName)
+            savepath = "%s/%s.pdf" % (projConfig.pdfSavePath, articleName)
             file = open(savepath, 'wb')
             print("open success")
             file.write(html)
             file.close()
-            print("pdf文件写入成功,文件ID为 %s" % tempid.PMCID, "保存路径为%s" % pdfSavePath)
+            print("pdf文件写入成功,文件ID为 %s" % tempid.PMCID, "保存路径为%s" % projConfig.pdfSavePath)
             return True
         except:
             print_error(f"pdf文件写入失败, 文件ID为 {tempid.PMCID}, 检查路径")
             return False
 
     @classmethod
     def PDFUpdateDB(cls, tempid: TempPMID, savepath: str, dbpath: str) -> bool:
-        tablename = 'pubmed%s' % savetime
+        tablename = 'pubmed%s' % projConfig.savetime
         try:
             writeSql = " UPDATE %s SET savepath = ? WHERE PMCID =?" % tablename
             param = (savepath, tempid.PMCID)
diff --git a/config.py b/config.py
@@ -1,17 +1,54 @@
 # -*- coding: utf-8 -*-
+import shutil
 import time
 
-savetime = time.strftime("%Y%m%d%H%M%S")
-feedbacktime: float = 1.5
-pdfSavePath = "./document/pub"
-
-# 这个参数用于geteachinfo决定一次性通过异步下载多少页面的信息，默认50啦
-batchsize: int = 50
-
-
 class ProjectInfo:
-    VersionInfo: str = "1.2.0"
+    VersionInfo: str = "1.2.1"
     ProjectName: str = "Pubmedsoso"
-    LastUpdate: str = "20241202"
+    LastUpdate: str = "20241212"
     AuthorName: str = "hiddenblue"
-# 此处的变量是main一次运行中需要多次调用的全局变量
+
+    @classmethod
+    def printProjectInfo(cls):
+        """Print a centered welcome banner followed by the project metadata.
+
+        Each non-dunder, non-callable class attribute is printed as a
+        ``name: value`` line centered on the current terminal width.
+        """
+        # Collect the printable fields once so the column width comes from
+        # exactly the names that are printed, not from method names such as
+        # this one (which previously inflated the padding).
+        fields = [
+            (key, value) for key, value in cls.__dict__.items()
+            if not key.startswith("__") and not callable(value)
+            and not isinstance(value, classmethod)
+        ]
+        max_width = max((len(key) for key, _ in fields), default=0)
+        terminal_width = shutil.get_terminal_size().columns
+
+        print("")
+        welcome_message = "欢迎使用 Pubmedsoso 文献检索工具"
+        print("")
+        print(welcome_message.center(terminal_width))
+        print("=" * terminal_width)  # separator line
+
+        for key, value in fields:
+            print(f"{key:<{max_width}}: {value}".center(terminal_width))
+        print("=" * terminal_width)  # closing separator
+        
+class GlobalConfig:
+    # Mutable run-wide settings; one shared instance is created at module level.
+    def __init__(self):
+        self.savetime: str = time.strftime("%Y%m%d%H%M%S")  # construction time; suffix of the 'pubmed<savetime>' names
+        self.feedbacktime: float = 1.5  # delay handed to sleep() in main.py
+        self.pdfSavePath: str = "./document/pub"  # PDF output directory
+        # Batch size geteachinfo uses for one round of async page downloads (default 50).
+        self.batchsize: int = 50
+
+
+# Importing this module from anywhere runs the line below once, creating a single instance that acts as shared global state.
+projConfig = GlobalConfig()
+
+if __name__ == "__main__":
+    
+    ProjectInfo.printProjectInfo()
diff --git a/geteachinfo.py b/geteachinfo.py
@@ -13,7 +13,9 @@
 from ExcelHelper import ExcelHelper
 from LogHelper import print_error
 from WebHelper import WebHelper
-from config import savetime, batchsize
+from config import projConfig
+
+batchsize = projConfig.batchsize
 
 
 def parse_abstract(
@@ -135,7 +137,7 @@ def parse_single_info(html_etree: etree.Element):
 
 
 def geteachinfo(dbpath):
-    tablename = 'pubmed%s' % savetime
+    tablename = 'pubmed%s' % projConfig.savetime
 
     PMID_list = DBFetchAllPMID(dbpath, tablename)
     if PMID_list == None:
diff --git a/main.py b/main.py
@@ -10,7 +10,9 @@
 from GetSearchResult import spiderpub
 from PDFHelper import PDFHelper
 from WebHelper import WebHelper
-from config import ProjectInfo, feedbacktime, pdfSavePath
+from config import ProjectInfo, projConfig
+
+feedbacktime = projConfig.feedbacktime
 
 
 def printSpliter(length=25):
@@ -22,9 +24,9 @@ def printSpliter(length=25):
     # 命令行参数解析
     parser = argparse.ArgumentParser(
         description="pubmedsoso is a python program for crawler article information and download pdf file",
-        usage="python main.py keyword ")
+        usage="python main.py keyword")
 
-    parser.add_argument('--version', '-v', action='version',
+    parser.add_argument('-v', '--version', action='version',
                         version=f'\nCurrent the {ProjectInfo.ProjectName}\n\n version: {ProjectInfo.VersionInfo}\n' +
                                 f'Last updated date: {ProjectInfo.LastUpdate} \n' +
                                 f'Author: {ProjectInfo.AuthorName} \n',
@@ -40,33 +42,48 @@ def printSpliter(length=25):
     parser.add_argument("keyword", type=str,
                         help='specify the keywords to search pubmed\n For example "headache"')
 
-    parser.add_argument("--page_num", "-n", type=int,
-                        help='add --number or -n to specify the page number you wanna to crawl'
-                             'For example --number 10. Default number is 10',
+    parser.add_argument("-n", "--pagenum", type=int, metavar='',
+                        help='add --pagenum or -n to specify the page number of info you wanna to crawl'
+                             'For example --pagenum 10. Default number is 10',
                         default=10)
 
-    parser.add_argument("--year", "-y", type=int,
+    parser.add_argument("-y", "--year", type=int, metavar='',
                         help='add --year or -y to specify year scale you would to search'
                              'For example --year 10. The Default is Not set',
                         default=None)
 
-    parser.add_argument("--download_num", "-d", type=int,
-                        help='add --download_num or -d to specify the doc number you wanna to download'
+    parser.add_argument("-d", "--downloadnum", type=int, metavar='',
+                        help='add --downloadnum or -d to specify the number of pdf you wanna to download'
                              'For example -d 10. Default number is 10',
                         default=10)
+    
+    parser.add_argument("-D", "--directory", type=str,  metavar='',
+                        help='add --directory or -D to specify the save path of pdf files. '
+                        'For example, -D ./output. Default path is ./document/pub; '
+                        'you can override the default path in config.py',
+                        default=None)  # None = option not given, so projConfig.pdfSavePath from config.py is kept
     ####################################################################################################
 
     args = parser.parse_args()
 
+    # print the hello info
+    ProjectInfo.printProjectInfo()
+    print("\n")
+    
+    # check the directory variable. the path variable from cli is preferred.
+    # the default pdf saving directory path is from config.py which is './document/pub'
+    if args.directory is not None:
+        projConfig.pdfSavePath = args.directory
+
     if args.keyword.isspace() or args.keyword.isnumeric():
         print("pubmedsoso search keyword error\n")
         sleep(feedbacktime)
 
-    print("\n欢迎使用Pubmedsoso 文件检索工具\n\n")
 
-    print(f"当前使用的命令行参数 {args.__dict__}\n")
+
+    print(f"Current commandline parameters: {args.__dict__}\n")
     print(
-        f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.page_num}, 年份：{args.year}, 文献下载数量:{args.download_num}\n")
+        f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.pagenum}, 年份：{args.year}, 文献下载数量: {args.downloadnum}, 下载文献的存储目录: {projConfig.pdfSavePath}\n")
     try:
         result_num = WebHelper.GetSearchResultNum(args.keyword)
     except Exception as err:
@@ -90,14 +107,14 @@ def printSpliter(length=25):
     printSpliter()
     sleep(0.5)
 
-    if os.path.exists(pdfSavePath):
+    if os.path.exists(projConfig.pdfSavePath):
         print("文件储存目录检查正常，可以储存文件\n")
     else:
-        os.makedirs(pdfSavePath)
-        print(f"成功在当前目录下建立 {pdfSavePath} 文件夹\n")
+        os.makedirs(projConfig.pdfSavePath)
+        print(f"成功在当前目录下建立 {projConfig.pdfSavePath} 文件夹\n")
 
     printSpliter()
-    print(f"{pdfSavePath} 目录检查完成，开始执行主程序\n")
+    print(f"{projConfig.pdfSavePath} 目录检查完成，开始执行主程序\n")
 
     sleep(feedbacktime)
 
@@ -112,7 +129,7 @@ def printSpliter(length=25):
 
     printSpliter()
 
-    spiderpub(encoded_param, args.page_num, result_num)
+    spiderpub(encoded_param, args.pagenum, result_num)
 
     printSpliter()
     print("\n\n爬取搜索结果完成，开始执行单篇检索，耗时更久\n\n")
@@ -123,11 +140,11 @@ def printSpliter(length=25):
     print("\n\n爬取搜索结果完成，开始执行文献下载，耗时更久\n\n")
 
     # PDFHelper.PDFBatchDonwload(args.download_num)
-    PDFHelper.PDFBatchDownloadEntry(args.download_num)
+    PDFHelper.PDFBatchDownloadEntry(args.downloadnum)
 
     ExcelHelper.PD_To_excel(dbpath, override=True)
     print("爬取最终结果信息已经自动保存到excel表格中，文件名为%s" % ExcelHelper.tablename)
-    print(f"爬取的所有文献已经保存到{pdfSavePath}目录下")
+    print(f"爬取的所有文献已经保存到{projConfig.pdfSavePath}目录下")
     print("爬取程序已经执行完成，自动退出, 哈哈，no errors no warning")
 
     printSpliter()