Skip to content

Commit 8be9b24

Browse files
committed
Feature: cli support -c clean mode
1 parent 3ecaf7b commit 8be9b24

File tree

8 files changed

+96
-26
lines changed

8 files changed

+96
-26
lines changed

.gitignore

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ __pycache__/
77
document/
88
*.xls
99
test.py
10-
pubmedsql
10+
pubmedsql.db
1111

1212

1313
# C extensions

README.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,7 @@ optional arguments:
9090
--output -o add --output filename to specify the name of the pdf file
9191
--loglevel -l set the console log level, e.g. debug
9292
--yes -Y add --yes or -Y to skip the confirmation process before search
93+
--clean -c clean the output directory and sqlite history table
9394
----
9495

9596
*If you are familiar with IDEs, you can run `main.py` in Python environments such as `pycharm` or `vscode`.*

README_CN.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -97,6 +97,7 @@ optional arguments:
9797
9898
--loglevel -l set the console log level, e.g. -l debug
9999
--yes -Y add --yes to skip the confirmation process before searching
100+
--clean -c clean the output directory and sqlite history table
100101
----
101102

102103
_如果你熟悉IDE的话,可以在pycharm或者vscode等python环境下运行main.py_

config.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,9 @@ def __init__(self):
99
self.savetime: str = time.strftime("%Y%m%d%H%M%S")
1010
self.feedbacktime: float = 1.5
1111
self.pdfSavePath: str = "./document/pub"
12+
self.dbpath: str = "./pubmedsql.db"
1213

13-
# 这个参数用于geteachinfo决定一次性通过异步下载多少页面的信息,默认50啦
14+
# 这个参数用于GetEachInfo决定一次性通过异步下载多少页面的信息,默认50啦
1415
self.InfoBatchSize: int = 50
1516
self.PDF_BatchSize: int = 5
1617

main.py

Lines changed: 41 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -5,15 +5,17 @@
55
from time import sleep
66

77
from GetEachInfo import geteachinfo
8-
from GetSearchResult import spiderpub
8+
from GetSearchResult import searchEntry
99
from config import ProjectInfo, projConfig
1010
from utils.Commandline import MedCli
1111
from utils.ExcelHelper import ExcelHelper
1212
from utils.LogHelper import medLog, MedLogger
1313
from utils.PDFHelper import PDFHelper
1414
from utils.WebHelper import WebHelper
1515

16+
# 从config.py当中导入一些配置信息,后续可能会被cli的参数override
1617
feedbacktime = projConfig.feedbacktime
18+
dbpath = projConfig.dbpath
1719

1820

1921
def printSpliter(length=25):
@@ -81,6 +83,8 @@ def printSpliter(length=25):
8183
help='add --output or -o to specify output path of pdf file '
8284
' For example, -o pmc7447651.pdf. Default is PMCxxxxxx.pdf',
8385
default='None')
86+
87+
# 调试 清理 相关的参数
8488

8589
parser.add_argument("-l", "--loglevel", metavar='',
8690
choices=('debug', 'info', 'warning', 'error', 'critical'),
@@ -93,30 +97,46 @@ def printSpliter(length=25):
9397
help='add --yes or -Y to skip the confirmation process and start searching directly',
9498
default=False)
9599

100+
parser.add_argument("-c", "--clean", action="store_true",
101+
help='clean the output directory and sqlite history table',
102+
default=False)
103+
96104
# todo
97105
# add mutual exclusive group for some args
98106

99107
####################################################################################################
100108

101109
args = parser.parse_args()
102110

111+
# 打印项目信息
103112
# print the hello info
104113
ProjectInfo.printProjectInfo()
105114
print("\n")
106115

116+
# 设置日志级别
107117
# alter the log level according to the cli args
108118
# cli first, overriding the config.py
109119
if args.loglevel is not None:
110120
loglevel = MedCli.parseLogLevel(args.loglevel)
111121
projConfig.loglevel = loglevel
112122
MedLogger.setTerminalLogLevel(medLog, loglevel)
123+
124+
# 清理历史记录
125+
if args.clean is True:
126+
# 默认的excel和txt输出目录应该是在当前文件夹
127+
MedCli.cleanHistory(directory="./", dbpath=dbpath, skip=args.yes)
128+
medLog.info("Exiting")
129+
sleep(feedbacktime)
130+
sys.exit()
113131

132+
# 单篇处理模式
114133
if args.keyword is None and (args.pmcid, args.pmid):
115134
# 关键词为空,进入单篇处理模式
116135
MedCli.SingleArticleMode(pmcid=args.pmcid, pmid=args.pmid)
117136
else:
118137
pass
119138

139+
# 设置保存目录
120140
# check the directory variable. the path variable from cli is preferred.
121141
# the default pdf saving directory path is from config.py which is './document/pub'
122142
if args.directory is not None:
@@ -127,44 +147,48 @@ def printSpliter(length=25):
127147
medLog.error("Please check your config.py and cli parameter." "The program will exit.")
128148
sys.exit()
129149

150+
# 检查关键词
130151
if args.keyword.isspace() or args.keyword.isnumeric():
131152
medLog.error("pubmedsoso search keyword error\n")
132153
medLog.error("the program will exit.")
133154
sleep(feedbacktime)
134155

135156
######################################################################################################
136157

158+
# 输出当前参数
137159
medLog.info(f"Current commandline parameters: {args.__dict__}\n")
138160
medLog.info(
139161
f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.pagenum}, 年份:{args.year}, 文献下载数量: {args.downloadnum}, 下载文献的存储目录: {projConfig.pdfSavePath}\n")
162+
163+
# 获取搜索结果数量
140164
try:
141165
result_num = WebHelper.GetSearchResultNum(keyword=args.keyword, year=args.year)
142166
except Exception as err:
143167
raise
144168

145169
medLog.info("当前关键词在pubmed检索到的相关结果数量为: %s\n" % result_num)
146-
170+
171+
# 确认开始执行程序
147172
# add --yes parameter to skip the confirmation
148-
if args.Yes is True:
173+
if args.yes is True:
149174
pass
150175
else:
151-
152176
medLog.info("是否要根据以上参数开始执行程序?y or n\n")
153177
startFlag = input()
154-
if startFlag == 'y' or startFlag == 'Y' or startFlag == 'Yes':
155-
pass
156-
if startFlag in ["n", "N", "No", "no"]:
178+
179+
if startFlag not in ['y', 'Y', 'Yes']:
157180
medLog.critical("程序终止执行\n\n")
158181
sleep(feedbacktime * 0.5)
159182
sys.exit()
160-
183+
161184
######################################################################################################
162185

163186
printSpliter()
164187
medLog.info("程序已运行,开始检查数据储存目录\n")
165188
printSpliter()
166189
sleep(0.5)
167190

191+
# 检查保存目录
168192
if os.path.exists(projConfig.pdfSavePath):
169193
medLog.info("文件储存目录检查正常,可以储存文件\n")
170194
else:
@@ -176,30 +200,33 @@ def printSpliter(length=25):
176200

177201
sleep(feedbacktime)
178202

179-
dbpath = "./pubmedsql"
180203
# ?term=cell%2Bblood&filter=datesearch.y_1&size=20
181204

182205
# 根据上面输入的关键词初始化生成url参数
206+
# 解析URL参数
183207
ParamDict = WebHelper.parseParamDcit(keyword=args.keyword, year=args.year)
184208
encoded_param = WebHelper.encodeParam(ParamDict)
185209

186-
# 从此处开始爬取数据
187210

188211
printSpliter()
189-
190-
spiderpub(encoded_param, args.pagenum, result_num)
212+
213+
# 爬取搜索结果
214+
searchEntry(encoded_param, args.pagenum, result_num)
191215

192216
printSpliter()
193217
medLog.info("\n\n爬取搜索结果完成,开始执行单篇检索,耗时更久\n\n")
194218

219+
# 获取每篇文章信息
195220
geteachinfo(dbpath)
196-
221+
197222
printSpliter()
198223
medLog.info("\n\n爬取搜索结果完成,开始执行文献下载,耗时更久\n\n")
199224

225+
# 下载PDF
200226
# PDFHelper.PDFBatchDonwload(args.download_num)
201227
PDFHelper.PDFBatchDownloadEntry(args.downloadnum)
202-
228+
229+
# 生成Excel表格
203230
ExcelHelper.PD_To_excel(dbpath, override=True)
204231
medLog.info("爬取最终结果信息已经自动保存到excel表格中,文件名为%s" % ExcelHelper.tablename)
205232
medLog.info(f"爬取的所有文献已经保存到{projConfig.pdfSavePath}目录下")

utils/Commandline.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,13 @@
11
import logging
22
import sys
3+
from time import sleep
34

5+
from utils.Clean import clean_files, clean_sqlite
6+
from config import projConfig
47
from utils.LogHelper import medLog
58

9+
feedbacktime = projConfig.feedbacktime
10+
611

712
class MedCli:
813

@@ -39,3 +44,31 @@ def SingleArticleMode(**kwargs):
3944

4045
medLog.warning("The program is exiting.\n")
4146
sys.exit(0)
47+
48+
@staticmethod
def cleanHistory(directory: str, dbpath: str, **kwargs):
    """
    Run the history clean-up after an optional confirmation prompt.

    Logs the target directory and database path, waits ``feedbacktime``
    seconds, asks the user to confirm (unless skipped), then calls
    ``clean_files(directory)`` followed by ``clean_sqlite(dbpath)``.

    :param directory: directory passed to ``clean_files``
    :param dbpath: sqlite database path passed to ``clean_sqlite``
    :param kwargs: optional ``skip``; only when it is exactly ``True`` is the
        confirmation prompt bypassed (the caller passes the -Y / --yes flag here)
    :return: None. Calls ``sys.exit()`` if the user does not confirm.
    """
    medLog.warning("The clean.py is up")
    medLog.info("The target directory is \"%s\"" % directory)
    medLog.info("The target database path is \"%s\"" % dbpath)
    sleep(feedbacktime)

    # A single lookup is enough: a missing key yields None, which is not True.
    skip_confirmation = kwargs.get('skip') is True

    if not skip_confirmation:
        medLog.info("是否要根据以上参数执行清理程序?y or n\n")
        answer = input()

        # Normalise the answer so "yes", "YES" or " y " are not treated as a
        # refusal; everything accepted before ('y', 'Y', 'Yes') still is.
        if answer.strip().lower() not in ('y', 'yes'):
            medLog.critical("程序终止执行\n\n")
            sleep(feedbacktime * 0.5)
            sys.exit()

    # Clean the files in the target directory
    clean_files(directory)
    # Clean the old tables in the database
    clean_sqlite(dbpath)
    medLog.warning("The clean.py is down")

utils/ExcelHelper.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
class ExcelHelper:
1515
savepath: str = f'./pubmed-{projConfig.savetime}.xlsx'
1616
tablename: str = f'pubmed{projConfig.savetime}'
17+
dbpath = projConfig.dbpath
1718
# 原始列名和新列名组成的字典
1819
rename_dict = {
1920
'id': '序号',
@@ -106,13 +107,13 @@ def PD_To_excel(cls, dbpath: str, override=False) -> None:
106107
except Exception as e:
107108
medLog.error(f"\n爬取数据库信息保存到Excel失败: {e}\n")
108109

109-
@staticmethod
110-
def local_export():
110+
@classmethod
111+
def local_export(cls):
111112
# todo
112113
# 将excel导出功能整合到cli当中
113114
import DBHelper
114115

115-
dbpath: str = 'pubmedsql'
116+
dbpath: str = cls.dbpath
116117
table_list: list = DBHelper.DBTableFinder(dbpath)
117118
if not table_list:
118119
medLog.critical("目标数据库不存在或者内容为空,请检查数据库,即将退出")

utils/PDFHelper.py

Lines changed: 13 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,13 @@
2222
# https://www.ncbi.nlm.nih.gov/pmc/articles/PMC9034016/pdf/main.pdf
2323

2424
class PDFHelper:
25+
"""
26+
处理PDF下载 保存相关逻辑的代码
27+
主要都是classmethod或者staticmethod
28+
29+
"""
30+
31+
# 类变量
2532
baseurl = "http://www.ncbi.nlm.nih.gov/"
2633
# 没有采用https是因为听说https的审查会增加延时
2734
headers = {
@@ -38,9 +45,8 @@ class PDFHelper:
3845
'sec-fetch-user': '?1',
3946
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 Edg/131.0.0.0',
4047
}
41-
42-
def __init__(self, dbpath):
43-
self.dbpath = dbpath
48+
dbpath = projConfig.dbpath
49+
tablename = 'pubmed%s' % projConfig.savetime
4450

4551
@staticmethod
4652
def handle_error(e):
@@ -69,8 +75,8 @@ def PDFBatchDownloadEntry(cls, limit):
6975
异步批量处理的pdf下载函数
7076
感觉写得稀烂啊
7177
"""
72-
tablename = 'pubmed%s' % projConfig.savetime
73-
dbpath = 'pubmedsql'
78+
tablename = cls.tablename
79+
dbpath = cls.dbpath
7480
# 注意这个列表的数据类型,和名称并不是相符的
7581
# 这个返回的结果是有免费全文的,包括 FreeArticle 和 FreePMCArticle 两类
7682
free_article_list: [TempPMID] = DBFetchAllFreePMC(dbpath, tablename)
@@ -196,10 +202,10 @@ def FileSave(cls, content: bytes, savepath: str) -> bool:
196202
medLog.info("open success")
197203
file.write(content)
198204
file.close()
199-
medLog.info("文件写入成功", "保存路径为%s" % savepath)
205+
medLog.info("文件写入成功, 保存路径为%s" % savepath)
200206
return True
201207
except Exception as e:
202-
medLog.error("文件写入失败", "保存路径为%s" % savepath)
208+
medLog.error("文件写入失败, 保存路径为%s" % savepath)
203209
medLog.error(e)
204210
return False
205211

0 commit comments

Comments
 (0)