from time import sleep

from GetEachInfo import geteachinfo
from GetSearchResult import searchEntry
from config import ProjectInfo, projConfig
from utils.Commandline import MedCli
from utils.ExcelHelper import ExcelHelper
from utils.LogHelper import medLog, MedLogger
from utils.PDFHelper import PDFHelper
from utils.WebHelper import WebHelper

# Configuration defaults imported from config.py; the CLI arguments parsed
# below may override these at runtime.
feedbacktime = projConfig.feedbacktime
dbpath = projConfig.dbpath
1921def printSpliter (length = 25 ):
@@ -81,6 +83,8 @@ def printSpliter(length=25):
8183 help = 'add --output or -o to specify output path of pdf file '
8284 ' For example, -o pmc7447651.pdf. Default is PMCxxxxxx.pdf' ,
8385 default = 'None' )
86+
87+ # 调试 清理 相关的参数
8488
8589 parser .add_argument ("-l" , "--loglevel" , metavar = '' ,
8690 choices = ('debug' , 'info' , 'warning' , 'error' , 'critical' ),
@@ -93,30 +97,46 @@ def printSpliter(length=25):
9397 help = 'add --yes or -Y to skip the confirmation process and start searching directly' ,
9498 default = False )
9599
parser.add_argument("-c", "--clean", action="store_true",
                    help='clean the output directory and sqlite history table',
                    default=False)

# todo
# add mutual exclusive group for some args

####################################################################################################

args = parser.parse_args()

# Print the project banner / hello info.
ProjectInfo.printProjectInfo()
print("\n")

# Alter the log level according to the cli args.
# The CLI value wins, overriding the default from config.py.
if args.loglevel is not None:
    loglevel = MedCli.parseLogLevel(args.loglevel)
    projConfig.loglevel = loglevel
    MedLogger.setTerminalLogLevel(medLog, loglevel)

# --clean: wipe the crawl history, then exit.
if args.clean is True:
    # The default excel/txt output directory is the current folder.
    MedCli.cleanHistory(directory="./", dbpath=dbpath, skip=args.yes)
    medLog.info("Exiting")
    sleep(feedbacktime)
    sys.exit()

# Single-article mode: no search keyword given, process one PMCID/PMID.
# NOTE(review): `(args.pmcid, args.pmid)` is a 2-tuple and therefore always
# truthy, so this branch effectively fires on `args.keyword is None` alone —
# confirm whether `(args.pmcid or args.pmid)` was intended.
if args.keyword is None and (args.pmcid, args.pmid):
    MedCli.SingleArticleMode(pmcid=args.pmcid, pmid=args.pmid)
else:
    pass
139+ # 设置保存目录
120140 # check the directory variable. the path variable from cli is preferred.
121141 # the default pdf saving directory path is from config.py which is './document/pub'
122142 if args .directory is not None :
@@ -127,44 +147,48 @@ def printSpliter(length=25):
127147 medLog .error ("Please check your config.py and cli parameter." "The program will exit." )
128148 sys .exit ()
129149
# Validate the search keyword: reject pure-whitespace or pure-digit input.
if args.keyword.isspace() or args.keyword.isnumeric():
    medLog.error("pubmedsoso search keyword error\n")
    medLog.error("the program will exit.")
    sleep(feedbacktime)
    # Fix: the message above promised an exit, but no exit followed and
    # execution continued with a known-bad keyword.
    sys.exit()

######################################################################################################

# Echo the effective parameters so the user can confirm them.
medLog.info(f"Current commandline parameters: {args.__dict__}\n")
medLog.info(
    f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.pagenum}, 年份:{args.year}, 文献下载数量: {args.downloadnum}, 下载文献的存储目录: {projConfig.pdfSavePath}\n")

# Ask pubmed how many hits this keyword/year filter yields.
try:
    result_num = WebHelper.GetSearchResultNum(keyword=args.keyword, year=args.year)
except Exception:
    # No recovery is possible without a result count; propagate upward.
    raise

medLog.info("当前关键词在pubmed检索到的相关结果数量为: %s\n" % result_num)

# --yes skips the interactive confirmation below.
if args.yes is True:
    pass
else:
    medLog.info("是否要根据以上参数开始执行程序?y or n\n")
    startFlag = input()

    # Anything other than an explicit yes aborts the run.
    if startFlag not in ['y', 'Y', 'Yes']:
        medLog.critical("程序终止执行\n\n")
        sleep(feedbacktime * 0.5)
        sys.exit()

######################################################################################################

printSpliter()
medLog.info("程序已运行,开始检查数据储存目录\n")
printSpliter()
sleep(0.5)
191+ # 检查保存目录
168192 if os .path .exists (projConfig .pdfSavePath ):
169193 medLog .info ("文件储存目录检查正常,可以储存文件\n " )
170194 else :
@@ -176,30 +200,33 @@ def printSpliter(length=25):
176200
177201 sleep (feedbacktime )
178202
# Example of the generated query string:
# ?term=cell%2Bblood&filter=datesearch.y_1&size=20
# (the hard-coded `dbpath = "./pubmedsql"` that used to live here shadowed the
# config-driven `dbpath` and has been removed)

# Build and url-encode the pubmed query parameters from the keyword above.
ParamDict = WebHelper.parseParamDcit(keyword=args.keyword, year=args.year)
encoded_param = WebHelper.encodeParam(ParamDict)

printSpliter()

# Crawl the search-result listing pages.
searchEntry(encoded_param, args.pagenum, result_num)

printSpliter()
medLog.info("\n\n爬取搜索结果完成,开始执行单篇检索,耗时更久\n\n")

# Fetch the detail page of every article collected above.
geteachinfo(dbpath)

printSpliter()
medLog.info("\n\n爬取搜索结果完成,开始执行文献下载,耗时更久\n\n")

# Batch-download the article PDFs.
PDFHelper.PDFBatchDownloadEntry(args.downloadnum)

# Export the final crawl results to an excel table.
ExcelHelper.PD_To_excel(dbpath, override=True)
medLog.info("爬取最终结果信息已经自动保存到excel表格中,文件名为%s" % ExcelHelper.tablename)
medLog.info(f"爬取的所有文献已经保存到{projConfig.pdfSavePath}目录下")