Skip to content

Commit cc1b451

Browse files
authored
Merge pull request #8 from hiddenblue/dev
origin/dev
2 parents a02b40a + a994644 commit cc1b451

File tree

7 files changed: +117 additions, -59 deletions

DBHelper.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -4,7 +4,7 @@
44
from DataType import Publication
55
from DataType import SingleDocInfo, TempPMID
66
from LogHelper import print_error
7-
from config import savetime
7+
from config import projConfig
88

99

1010
# 把一些关于sqlite3相关的操作抽象出来了,方便其他模块调用
@@ -173,7 +173,7 @@ def DBWriter(dbpath: str, sql: str, params: tuple = None) -> bool:
173173

174174
# 这个函数是用来保存文献打开页面获取到的单独的信息的
175175
def DBSaveInfo(singleinfo: SingleDocInfo, dbpath: str):
176-
tablename = 'pubmed%s' % savetime
176+
tablename = 'pubmed%s' % projConfig.savetime
177177

178178
ret = False
179179
try:

ExcelHelper.py

Lines changed: 5 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -6,12 +6,13 @@
66
import pandas as pd
77

88
from LogHelper import print_error
9-
from config import savetime, feedbacktime
9+
from config import projConfig
10+
feedbacktime = projConfig.feedbacktime
1011

1112

1213
class ExcelHelper:
13-
savepath: str = f'./pubmed-{savetime}.xlsx'
14-
tablename: str = f'pubmed{savetime}'
14+
savepath: str = f'./pubmed-{projConfig.savetime}.xlsx'
15+
tablename: str = f'pubmed{projConfig.savetime}'
1516
# 原始列名和新列名组成的字典
1617
rename_dict = {
1718
'id': '序号',
@@ -131,6 +132,7 @@ def PD_To_excel(cls, dbpath: str, override=False) -> None:
131132
break
132133
if 1 <= x <= len(table_list):
133134
index = table_list[x - 1]
135+
# todo
134136
savetime = index[6:]
135137
ExcelHelper.PD_To_excel(dbpath)
136138
print("此次保存执行完成,下一个循环")

GetSearchResult.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,7 @@
1010
from DataType import ArticleFreeType, SingleSearchData
1111
from LogHelper import print_error
1212
from WebHelper import WebHelper
13-
from config import savetime
13+
from config import projConfig
1414

1515

1616
def parseSearchHtml(html: str) -> Optional[List[SingleSearchData]]:
@@ -119,7 +119,7 @@ def SaveSearchData(datalist: List[SingleSearchData], dbpath: str) -> None:
119119
datalist (List[SingleSearchData]): List of parsed search data.
120120
dbpath (str): Path to the SQLite database.
121121
"""
122-
tablename = f'pubmed{savetime}'
122+
tablename = f'pubmed{projConfig.savetime}'
123123
for singleSearchData in datalist:
124124
try:
125125
sql = f"""
@@ -182,8 +182,8 @@ def spiderpub(parameter: str, page_limit: int, resultNum: int) -> None:
182182
datalist.extend(SingleSearchPageData)
183183

184184
dbpath = 'pubmedsql'
185-
tablename = f'pubmed{savetime}'
186-
txtname = f"pubmed{savetime}.txt"
185+
tablename = f'pubmed{projConfig.savetime}'
186+
txtname = f"pubmed{projConfig.savetime}.txt"
187187

188188
try:
189189
DBHelper.DBCreater(dbpath)

PDFHelper.py

Lines changed: 18 additions & 18 deletions
Original file line number | Diff line number | Diff line change
@@ -11,7 +11,7 @@
1111
from DBHelper import DBWriter, DBFetchAllFreePMC
1212
from DataType import TempPMID
1313
from LogHelper import print_error
14-
from config import savetime, pdfSavePath
14+
from config import projConfig
1515

1616

1717
# 把一些关于PDF相关的操作抽象出来了,方便其他模块调用
@@ -45,29 +45,29 @@ def handle_error(e):
4545
print_error("Error occured: %s" % e)
4646

4747
@staticmethod
48-
def IsFileExist(path: str) -> bool:
48+
def __IsFileExist(path: str) -> bool:
4949
return Path(path).exists()
5050

5151
@classmethod
52-
def IsPDFExist(cls, tempid) -> bool:
53-
savepath = cls.GetPDFSavePath(tempid)
54-
return PDFHelper.IsFileExist(savepath)
52+
def __IsPDFExist(cls, tempid) -> bool:
53+
savepath = cls.__GetPDFSavePath(tempid)
54+
return PDFHelper.__IsFileExist(savepath)
5555

5656
@classmethod
57-
def GetPDFFileName(cls, tempid: TempPMID) -> str:
57+
def __GetPDFFileName(cls, tempid: TempPMID) -> str:
5858
return re.sub(r'[< >/\\|:"*?]', ' ', tempid.doctitle)
5959

6060
@classmethod
61-
def GetPDFSavePath(cls, tempid: TempPMID) -> str:
62-
return f"{pdfSavePath}/{cls.GetPDFFileName(tempid)}.pdf"
61+
def __GetPDFSavePath(cls, tempid: TempPMID) -> str:
62+
return f"{projConfig.pdfSavePath}/{cls.__GetPDFFileName(tempid)}.pdf"
6363

6464
@classmethod
6565
def PDFBatchDownloadEntry(cls, limit):
6666
"""
6767
异步批量处理的pdf下载函数
6868
感觉写得稀烂啊
6969
"""
70-
tablename = 'pubmed%s' % savetime
70+
tablename = 'pubmed%s' % projConfig.savetime
7171
count = 0
7272
dbpath = 'pubmedsql'
7373
PMID_list: [TempPMID] = DBFetchAllFreePMC(dbpath, tablename)
@@ -77,10 +77,10 @@ def PDFBatchDownloadEntry(cls, limit):
7777
# 过滤掉已经存在于本地的文献
7878
PMCID_list = []
7979
for item in temp_list:
80-
if cls.IsPDFExist(item):
80+
if cls.__IsPDFExist(item):
8181
# 存在于目录当中直接更新就行了
82-
cls.PDFUpdateDB(item, cls.GetPDFSavePath(item), dbpath)
83-
print(f"PDF: {cls.GetPDFFileName(item)} 在保存目录当中已存在,跳过下载")
82+
cls.PDFUpdateDB(item, cls.__GetPDFSavePath(item), dbpath)
83+
print(f"PDF: {cls.__GetPDFFileName(item)} 在保存目录当中已存在,跳过下载")
8484
else:
8585
PMCID_list.append(item)
8686

@@ -95,7 +95,7 @@ def PDFBatchDownloadEntry(cls, limit):
9595
status = PDFHelper.PDFSaveFile(item[1], item[0])
9696
if (status == True):
9797
# 将pdf文件名称以及存储位置等相关信息添加到sqlite数据库当中
98-
PDFHelper.PDFUpdateDB(item[0], cls.GetPDFSavePath(item[0]), dbpath)
98+
PDFHelper.PDFUpdateDB(item[0], cls.__GetPDFSavePath(item[0]), dbpath)
9999
else:
100100
print_error("保存pdf文件发生错误,自动跳过该文献PMCID为 %s" % item[0].PMCID)
101101
continue
@@ -153,26 +153,26 @@ def PDFSaveFile(cls, html, tempid: TempPMID) -> bool:
153153
将pdf保存到本地文件的功能
154154
暂时还不确定能否支持异步,就先用同步版本了
155155
"""
156-
tablename = 'pubmed%s' % savetime
156+
tablename = 'pubmed%s' % projConfig.savetime
157157
# pdf = html.decode("utf-8") # 使用Unicode8对二进制网页进行解码,直接写入文件就不需要解码了
158158

159159
try:
160-
articleName = cls.GetPDFFileName(tempid)
160+
articleName = cls.__GetPDFFileName(tempid)
161161
# 需要注意的是文件命名中不能含有以上特殊符号,只能去除掉
162-
savepath = "%s/%s.pdf" % (pdfSavePath, articleName)
162+
savepath = "%s/%s.pdf" % (projConfig.pdfSavePath, articleName)
163163
file = open(savepath, 'wb')
164164
print("open success")
165165
file.write(html)
166166
file.close()
167-
print("pdf文件写入成功,文件ID为 %s" % tempid.PMCID, "保存路径为%s" % pdfSavePath)
167+
print("pdf文件写入成功,文件ID为 %s" % tempid.PMCID, "保存路径为%s" % projConfig.pdfSavePath)
168168
return True
169169
except:
170170
print_error(f"pdf文件写入失败, 文件ID为 {tempid.PMCID}, 检查路径")
171171
return False
172172

173173
@classmethod
174174
def PDFUpdateDB(cls, tempid: TempPMID, savepath: str, dbpath: str) -> bool:
175-
tablename = 'pubmed%s' % savetime
175+
tablename = 'pubmed%s' % projConfig.savetime
176176
try:
177177
writeSql = " UPDATE %s SET savepath = ? WHERE PMCID =?" % tablename
178178
param = (savepath, tempid.PMCID)

config.py

Lines changed: 48 additions & 11 deletions
Original file line number | Diff line number | Diff line change
@@ -1,17 +1,54 @@
11
# -*- coding: utf-8 -*-
2+
import shutil
23
import time
34

4-
savetime = time.strftime("%Y%m%d%H%M%S")
5-
feedbacktime: float = 1.5
6-
pdfSavePath = "./document/pub"
7-
8-
# 这个参数用于geteachinfo决定一次性通过异步下载多少页面的信息,默认50啦
9-
batchsize: int = 50
10-
11-
125
class ProjectInfo:
13-
VersionInfo: str = "1.2.0"
6+
VersionInfo: str = "1.2.1"
147
ProjectName: str = "Pubmedsoso"
15-
LastUpdate: str = "20241202"
8+
LastUpdate: str = "20241212"
169
AuthorName: str = "hiddenblue"
17-
# 此处的变量是main一次运行中需要多次调用的全局变量
10+
11+
@classmethod
12+
def printProjectInfo(cls):
13+
# 动态计算字段名称的最大宽度
14+
max_width = max(len(key) for key in cls.__dict__.keys() if not key.startswith("__"))
15+
16+
# 获取终端宽度
17+
terminal_width = shutil.get_terminal_size().columns
18+
19+
print("")
20+
# 打印居中的欢迎信息
21+
welcome_message = "欢迎使用 Pubmedsoso 文献检索工具"
22+
print("")
23+
24+
print(welcome_message.center(terminal_width))
25+
26+
# 打印分隔线
27+
print("=" * terminal_width)
28+
29+
30+
31+
# 打印项目信息
32+
for key, value in cls.__dict__.items():
33+
if not key.startswith("__") and not callable(value) and not isinstance(value, classmethod):
34+
print(f"{key:<{max_width}}: {value}".center(terminal_width))
35+
36+
# 打印分隔线
37+
print("=" * terminal_width)
38+
39+
class GlobalConfig:
40+
def __init__(self):
41+
self.savetime: str = time.strftime("%Y%m%d%H%M%S")
42+
self.feedbacktime: float = 1.5
43+
self.pdfSavePath: str = "./document/pub"
44+
45+
# 这个参数用于geteachinfo决定一次性通过异步下载多少页面的信息,默认50啦
46+
self.batchsize: int = 50
47+
48+
49+
# 下面这句在从其他模块导入这个变量执行就会自动执行,并且是一个全局共享的状态
50+
projConfig = GlobalConfig()
51+
52+
if __name__ == "__main__":
53+
54+
ProjectInfo.printProjectInfo()

geteachinfo.py

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -13,7 +13,9 @@
1313
from ExcelHelper import ExcelHelper
1414
from LogHelper import print_error
1515
from WebHelper import WebHelper
16-
from config import savetime, batchsize
16+
from config import projConfig
17+
18+
batchsize = projConfig.batchsize
1719

1820

1921
def parse_abstract(
@@ -135,7 +137,7 @@ def parse_single_info(html_etree: etree.Element):
135137

136138

137139
def geteachinfo(dbpath):
138-
tablename = 'pubmed%s' % savetime
140+
tablename = 'pubmed%s' % projConfig.savetime
139141

140142
PMID_list = DBFetchAllPMID(dbpath, tablename)
141143
if PMID_list == None:

main.py

Lines changed: 36 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,9 @@
1010
from GetSearchResult import spiderpub
1111
from PDFHelper import PDFHelper
1212
from WebHelper import WebHelper
13-
from config import ProjectInfo, feedbacktime, pdfSavePath
13+
from config import ProjectInfo, projConfig
14+
15+
feedbacktime = projConfig.feedbacktime
1416

1517

1618
def printSpliter(length=25):
@@ -22,9 +24,9 @@ def printSpliter(length=25):
2224
# 命令行参数解析
2325
parser = argparse.ArgumentParser(
2426
description="pubmedsoso is a python program for crawler article information and download pdf file",
25-
usage="python main.py keyword ")
27+
usage="python main.py keyword")
2628

27-
parser.add_argument('--version', '-v', action='version',
29+
parser.add_argument('-v', '--version', action='version',
2830
version=f'\nCurrent the {ProjectInfo.ProjectName}\n\n version: {ProjectInfo.VersionInfo}\n' +
2931
f'Last updated date: {ProjectInfo.LastUpdate} \n' +
3032
f'Author: {ProjectInfo.AuthorName} \n',
@@ -40,33 +42,48 @@ def printSpliter(length=25):
4042
parser.add_argument("keyword", type=str,
4143
help='specify the keywords to search pubmed\n For example "headache"')
4244

43-
parser.add_argument("--page_num", "-n", type=int,
44-
help='add --number or -n to specify the page number you wanna to crawl'
45-
'For example --number 10. Default number is 10',
45+
parser.add_argument("-n", "--pagenum", type=int, metavar='',
46+
help='add --pagenum or -n to specify the page number of info you wanna to crawl'
47+
'For example --pagenum 10. Default number is 10',
4648
default=10)
4749

48-
parser.add_argument("--year", "-y", type=int,
50+
parser.add_argument("-y", "--year", type=int, metavar='',
4951
help='add --year or -y to specify year scale you would to search'
5052
'For example --year 10. The Default is Not set',
5153
default=None)
5254

53-
parser.add_argument("--download_num", "-d", type=int,
54-
help='add --download_num or -d to specify the doc number you wanna to download'
55+
parser.add_argument("-d", "--downloadnum", type=int, metavar='',
56+
help='add --downloadnum or -d to specify the number of pdf you wanna to download'
5557
'For example -d 10. Default number is 10',
5658
default=10)
59+
60+
parser.add_argument("-D", "--directory", type=str, metavar='',
61+
help='add --directory or -D specify the save path of pdf file'
62+
'For example, -D ./output. Default path is ./document/pub'
63+
'you can overrider the default path in config.py',
64+
default='./document/pub')
5765
####################################################################################################
5866

5967
args = parser.parse_args()
6068

69+
# print the hello info
70+
ProjectInfo.printProjectInfo()
71+
print("\n")
72+
73+
# check the directory variable. the path variable from cli is preferred.
74+
# the default pdf saving directory path is from config.py which is './document/pub'
75+
if args.directory is not None:
76+
projConfig.pdfSavePath = args.directory
77+
6178
if args.keyword.isspace() or args.keyword.isnumeric():
6279
print("pubmedsoso search keyword error\n")
6380
sleep(feedbacktime)
6481

65-
print("\n欢迎使用Pubmedsoso 文件检索工具\n\n")
6682

67-
print(f"当前使用的命令行参数 {args.__dict__}\n")
83+
84+
print(f"Current commandline parameters: {args.__dict__}\n")
6885
print(
69-
f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.page_num}, 年份:{args.year}, 文献下载数量:{args.download_num}\n")
86+
f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.pagenum}, 年份:{args.year}, 文献下载数量: {args.downloadnum}, 下载文献的存储目录: {projConfig.pdfSavePath}\n")
7087
try:
7188
result_num = WebHelper.GetSearchResultNum(args.keyword)
7289
except Exception as err:
@@ -90,14 +107,14 @@ def printSpliter(length=25):
90107
printSpliter()
91108
sleep(0.5)
92109

93-
if os.path.exists(pdfSavePath):
110+
if os.path.exists(projConfig.pdfSavePath):
94111
print("文件储存目录检查正常,可以储存文件\n")
95112
else:
96-
os.makedirs(pdfSavePath)
97-
print(f"成功在当前目录下建立 {pdfSavePath} 文件夹\n")
113+
os.makedirs(projConfig.pdfSavePath)
114+
print(f"成功在当前目录下建立 {projConfig.pdfSavePath} 文件夹\n")
98115

99116
printSpliter()
100-
print(f"{pdfSavePath} 目录检查完成,开始执行主程序\n")
117+
print(f"{projConfig.pdfSavePath} 目录检查完成,开始执行主程序\n")
101118

102119
sleep(feedbacktime)
103120

@@ -112,7 +129,7 @@ def printSpliter(length=25):
112129

113130
printSpliter()
114131

115-
spiderpub(encoded_param, args.page_num, result_num)
132+
spiderpub(encoded_param, args.pagenum, result_num)
116133

117134
printSpliter()
118135
print("\n\n爬取搜索结果完成,开始执行单篇检索,耗时更久\n\n")
@@ -123,11 +140,11 @@ def printSpliter(length=25):
123140
print("\n\n爬取搜索结果完成,开始执行文献下载,耗时更久\n\n")
124141

125142
# PDFHelper.PDFBatchDonwload(args.download_num)
126-
PDFHelper.PDFBatchDownloadEntry(args.download_num)
143+
PDFHelper.PDFBatchDownloadEntry(args.downloadnum)
127144

128145
ExcelHelper.PD_To_excel(dbpath, override=True)
129146
print("爬取最终结果信息已经自动保存到excel表格中,文件名为%s" % ExcelHelper.tablename)
130-
print(f"爬取的所有文献已经保存到{pdfSavePath}目录下")
147+
print(f"爬取的所有文献已经保存到{projConfig.pdfSavePath}目录下")
131148
print("爬取程序已经执行完成,自动退出, 哈哈,no errors no warning")
132149

133150
printSpliter()

0 commit comments

Comments
 (0)