Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions DBHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from DataType import Publication
from DataType import SingleDocInfo, TempPMID
from LogHelper import print_error
from config import savetime
from config import projConfig


# 把一些关于sqlite3相关的操作抽象出来了,方便其他模块调用
Expand Down Expand Up @@ -173,7 +173,7 @@ def DBWriter(dbpath: str, sql: str, params: tuple = None) -> bool:

# 这个函数是用来保存文献打开页面获取到的单独的信息的
def DBSaveInfo(singleinfo: SingleDocInfo, dbpath: str):
tablename = 'pubmed%s' % savetime
tablename = 'pubmed%s' % projConfig.savetime

ret = False
try:
Expand Down
8 changes: 5 additions & 3 deletions ExcelHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,12 +6,13 @@
import pandas as pd

from LogHelper import print_error
from config import savetime, feedbacktime
from config import projConfig
feedbacktime = projConfig.feedbacktime


class ExcelHelper:
savepath: str = f'./pubmed-{savetime}.xlsx'
tablename: str = f'pubmed{savetime}'
savepath: str = f'./pubmed-{projConfig.savetime}.xlsx'
tablename: str = f'pubmed{projConfig.savetime}'
# 原始列名和新列名组成的字典
rename_dict = {
'id': '序号',
Expand Down Expand Up @@ -131,6 +132,7 @@ def PD_To_excel(cls, dbpath: str, override=False) -> None:
break
if 1 <= x <= len(table_list):
index = table_list[x - 1]
# todo
savetime = index[6:]
ExcelHelper.PD_To_excel(dbpath)
print("此次保存执行完成,下一个循环")
Expand Down
8 changes: 4 additions & 4 deletions GetSearchResult.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from DataType import ArticleFreeType, SingleSearchData
from LogHelper import print_error
from WebHelper import WebHelper
from config import savetime
from config import projConfig


def parseSearchHtml(html: str) -> Optional[List[SingleSearchData]]:
Expand Down Expand Up @@ -119,7 +119,7 @@ def SaveSearchData(datalist: List[SingleSearchData], dbpath: str) -> None:
datalist (List[SingleSearchData]): List of parsed search data.
dbpath (str): Path to the SQLite database.
"""
tablename = f'pubmed{savetime}'
tablename = f'pubmed{projConfig.savetime}'
for singleSearchData in datalist:
try:
sql = f"""
Expand Down Expand Up @@ -182,8 +182,8 @@ def spiderpub(parameter: str, page_limit: int, resultNum: int) -> None:
datalist.extend(SingleSearchPageData)

dbpath = 'pubmedsql'
tablename = f'pubmed{savetime}'
txtname = f"pubmed{savetime}.txt"
tablename = f'pubmed{projConfig.savetime}'
txtname = f"pubmed{projConfig.savetime}.txt"

try:
DBHelper.DBCreater(dbpath)
Expand Down
36 changes: 18 additions & 18 deletions PDFHelper.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from DBHelper import DBWriter, DBFetchAllFreePMC
from DataType import TempPMID
from LogHelper import print_error
from config import savetime, pdfSavePath
from config import projConfig


# 把一些关于PDF相关的操作抽象出来了,方便其他模块调用
Expand Down Expand Up @@ -45,29 +45,29 @@ def handle_error(e):
print_error("Error occured: %s" % e)

@staticmethod
def IsFileExist(path: str) -> bool:
def __IsFileExist(path: str) -> bool:
return Path(path).exists()

@classmethod
def IsPDFExist(cls, tempid) -> bool:
savepath = cls.GetPDFSavePath(tempid)
return PDFHelper.IsFileExist(savepath)
def __IsPDFExist(cls, tempid) -> bool:
savepath = cls.__GetPDFSavePath(tempid)
return PDFHelper.__IsFileExist(savepath)

@classmethod
def GetPDFFileName(cls, tempid: TempPMID) -> str:
def __GetPDFFileName(cls, tempid: TempPMID) -> str:
return re.sub(r'[< >/\\|:"*?]', ' ', tempid.doctitle)

@classmethod
def GetPDFSavePath(cls, tempid: TempPMID) -> str:
return f"{pdfSavePath}/{cls.GetPDFFileName(tempid)}.pdf"
def __GetPDFSavePath(cls, tempid: TempPMID) -> str:
return f"{projConfig.pdfSavePath}/{cls.__GetPDFFileName(tempid)}.pdf"

@classmethod
def PDFBatchDownloadEntry(cls, limit):
"""
异步批量处理的pdf下载函数
感觉写得稀烂啊
"""
tablename = 'pubmed%s' % savetime
tablename = 'pubmed%s' % projConfig.savetime
count = 0
dbpath = 'pubmedsql'
PMID_list: [TempPMID] = DBFetchAllFreePMC(dbpath, tablename)
Expand All @@ -77,10 +77,10 @@ def PDFBatchDownloadEntry(cls, limit):
# 过滤掉已经存在于本地的文献
PMCID_list = []
for item in temp_list:
if cls.IsPDFExist(item):
if cls.__IsPDFExist(item):
# 存在于目录当中直接更新就行了
cls.PDFUpdateDB(item, cls.GetPDFSavePath(item), dbpath)
print(f"PDF: {cls.GetPDFFileName(item)} 在保存目录当中已存在,跳过下载")
cls.PDFUpdateDB(item, cls.__GetPDFSavePath(item), dbpath)
print(f"PDF: {cls.__GetPDFFileName(item)} 在保存目录当中已存在,跳过下载")
else:
PMCID_list.append(item)

Expand All @@ -95,7 +95,7 @@ def PDFBatchDownloadEntry(cls, limit):
status = PDFHelper.PDFSaveFile(item[1], item[0])
if (status == True):
# 将pdf文件名称以及存储位置等相关信息添加到sqlite数据库当中
PDFHelper.PDFUpdateDB(item[0], cls.GetPDFSavePath(item[0]), dbpath)
PDFHelper.PDFUpdateDB(item[0], cls.__GetPDFSavePath(item[0]), dbpath)
else:
print_error("保存pdf文件发生错误,自动跳过该文献PMCID为 %s" % item[0].PMCID)
continue
Expand Down Expand Up @@ -153,26 +153,26 @@ def PDFSaveFile(cls, html, tempid: TempPMID) -> bool:
将pdf保存到本地文件的功能
暂时还不确定能否支持异步,就先用同步版本了
"""
tablename = 'pubmed%s' % savetime
tablename = 'pubmed%s' % projConfig.savetime
# pdf = html.decode("utf-8") # 使用Unicode8对二进制网页进行解码,直接写入文件就不需要解码了

try:
articleName = cls.GetPDFFileName(tempid)
articleName = cls.__GetPDFFileName(tempid)
# 需要注意的是文件命名中不能含有以上特殊符号,只能去除掉
savepath = "%s/%s.pdf" % (pdfSavePath, articleName)
savepath = "%s/%s.pdf" % (projConfig.pdfSavePath, articleName)
file = open(savepath, 'wb')
print("open success")
file.write(html)
file.close()
print("pdf文件写入成功,文件ID为 %s" % tempid.PMCID, "保存路径为%s" % pdfSavePath)
print("pdf文件写入成功,文件ID为 %s" % tempid.PMCID, "保存路径为%s" % projConfig.pdfSavePath)
return True
except:
print_error(f"pdf文件写入失败, 文件ID为 {tempid.PMCID}, 检查路径")
return False

@classmethod
def PDFUpdateDB(cls, tempid: TempPMID, savepath: str, dbpath: str) -> bool:
tablename = 'pubmed%s' % savetime
tablename = 'pubmed%s' % projConfig.savetime
try:
writeSql = " UPDATE %s SET savepath = ? WHERE PMCID =?" % tablename
param = (savepath, tempid.PMCID)
Expand Down
59 changes: 48 additions & 11 deletions config.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,54 @@
# -*- coding: utf-8 -*-
import shutil
import time

savetime = time.strftime("%Y%m%d%H%M%S")
feedbacktime: float = 1.5
pdfSavePath = "./document/pub"

# 这个参数用于geteachinfo决定一次性通过异步下载多少页面的信息,默认50啦
batchsize: int = 50


class ProjectInfo:
    """Static project metadata plus a banner printer used at startup."""

    # NOTE(review): the flattened diff contained both old and new values of
    # VersionInfo/LastUpdate; only the newer value of each is kept here.
    VersionInfo: str = "1.2.1"
    ProjectName: str = "Pubmedsoso"
    LastUpdate: str = "20241212"
    AuthorName: str = "hiddenblue"
    # These class attributes are globals read repeatedly during one run of main.

    @classmethod
    def printProjectInfo(cls):
        """Print a centered welcome banner followed by the metadata fields."""
        # Collect only the fields that will actually be printed: skip dunders,
        # callables and classmethod objects.  The original computed max_width
        # over ALL non-dunder keys, so method names (e.g. this one) inflated
        # the column width; filtering identically here fixes the alignment.
        printable = {
            key: value
            for key, value in cls.__dict__.items()
            if not key.startswith("__")
            and not callable(value)
            and not isinstance(value, classmethod)
        }
        # Widest field name, so the values line up in one column.
        max_width = max(len(key) for key in printable)

        # Center everything on the current terminal width.
        terminal_width = shutil.get_terminal_size().columns

        print("")
        # Print the centered welcome message.
        welcome_message = "欢迎使用 Pubmedsoso 文献检索工具"
        print("")
        print(welcome_message.center(terminal_width))

        # Separator line.
        print("=" * terminal_width)

        # Print each metadata field, left-padded to the common width.
        for key, value in printable.items():
            print(f"{key:<{max_width}}: {value}".center(terminal_width))

        # Closing separator line.
        print("=" * terminal_width)

class GlobalConfig:
    """Mutable run-wide configuration; one shared instance is created below."""

    def __init__(self):
        # Timestamp frozen at construction time; other modules use it to name
        # database tables and output files for this run.
        stamp = time.strftime("%Y%m%d%H%M%S")
        self.savetime: str = stamp
        # Seconds to pause so the user can read feedback messages.
        self.feedbacktime: float = 1.5
        # Default directory for downloaded PDF files.
        self.pdfSavePath: str = "./document/pub"
        # Used by geteachinfo: how many pages are fetched asynchronously in
        # one batch (default 50).
        self.batchsize: int = 50


# Instantiated at import time: any module that imports this variable shares
# this single, globally shared configuration state.
projConfig = GlobalConfig()

if __name__ == "__main__":

    ProjectInfo.printProjectInfo()
6 changes: 4 additions & 2 deletions geteachinfo.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from ExcelHelper import ExcelHelper
from LogHelper import print_error
from WebHelper import WebHelper
from config import savetime, batchsize
from config import projConfig

batchsize = projConfig.batchsize


def parse_abstract(
Expand Down Expand Up @@ -135,7 +137,7 @@ def parse_single_info(html_etree: etree.Element):


def geteachinfo(dbpath):
tablename = 'pubmed%s' % savetime
tablename = 'pubmed%s' % projConfig.savetime

PMID_list = DBFetchAllPMID(dbpath, tablename)
if PMID_list == None:
Expand Down
55 changes: 36 additions & 19 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@
from GetSearchResult import spiderpub
from PDFHelper import PDFHelper
from WebHelper import WebHelper
from config import ProjectInfo, feedbacktime, pdfSavePath
from config import ProjectInfo, projConfig

feedbacktime = projConfig.feedbacktime


def printSpliter(length=25):
Expand All @@ -22,9 +24,9 @@ def printSpliter(length=25):
# 命令行参数解析
parser = argparse.ArgumentParser(
description="pubmedsoso is a python program for crawler article information and download pdf file",
usage="python main.py keyword ")
usage="python main.py keyword")

parser.add_argument('--version', '-v', action='version',
parser.add_argument('-v', '--version', action='version',
version=f'\nCurrent the {ProjectInfo.ProjectName}\n\n version: {ProjectInfo.VersionInfo}\n' +
f'Last updated date: {ProjectInfo.LastUpdate} \n' +
f'Author: {ProjectInfo.AuthorName} \n',
Expand All @@ -40,33 +42,48 @@ def printSpliter(length=25):
parser.add_argument("keyword", type=str,
help='specify the keywords to search pubmed\n For example "headache"')

parser.add_argument("--page_num", "-n", type=int,
help='add --number or -n to specify the page number you wanna to crawl'
'For example --number 10. Default number is 10',
parser.add_argument("-n", "--pagenum", type=int, metavar='',
help='add --pagenum or -n to specify the page number of info you wanna to crawl'
'For example --pagenum 10. Default number is 10',
default=10)

parser.add_argument("--year", "-y", type=int,
parser.add_argument("-y", "--year", type=int, metavar='',
help='add --year or -y to specify year scale you would to search'
'For example --year 10. The Default is Not set',
default=None)

parser.add_argument("--download_num", "-d", type=int,
help='add --download_num or -d to specify the doc number you wanna to download'
parser.add_argument("-d", "--downloadnum", type=int, metavar='',
help='add --downloadnum or -d to specify the number of pdf you wanna to download'
'For example -d 10. Default number is 10',
default=10)

parser.add_argument("-D", "--directory", type=str, metavar='',
help='add --directory or -D specify the save path of pdf file'
'For example, -D ./output. Default path is ./document/pub'
'you can overrider the default path in config.py',
default='./document/pub')
####################################################################################################

args = parser.parse_args()

# print the hello info
ProjectInfo.printProjectInfo()
print("\n")

# check the directory variable. the path variable from cli is preferred.
# the default pdf saving directory path is from config.py which is './document/pub'
if args.directory is not None:
projConfig.pdfSavePath = args.directory

if args.keyword.isspace() or args.keyword.isnumeric():
print("pubmedsoso search keyword error\n")
sleep(feedbacktime)

print("\n欢迎使用Pubmedsoso 文件检索工具\n\n")

print(f"当前使用的命令行参数 {args.__dict__}\n")

print(f"Current commandline parameters: {args.__dict__}\n")
print(
f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.page_num}, 年份:{args.year}, 文献下载数量:{args.download_num}\n")
f"当前使用的命令行参数 搜索关键词: \"{args.keyword}\", 文献信息检索数量: {args.pagenum}, 年份:{args.year}, 文献下载数量: {args.downloadnum}, 下载文献的存储目录: {projConfig.pdfSavePath}\n")
try:
result_num = WebHelper.GetSearchResultNum(args.keyword)
except Exception as err:
Expand All @@ -90,14 +107,14 @@ def printSpliter(length=25):
printSpliter()
sleep(0.5)

if os.path.exists(pdfSavePath):
if os.path.exists(projConfig.pdfSavePath):
print("文件储存目录检查正常,可以储存文件\n")
else:
os.makedirs(pdfSavePath)
print(f"成功在当前目录下建立 {pdfSavePath} 文件夹\n")
os.makedirs(projConfig.pdfSavePath)
print(f"成功在当前目录下建立 {projConfig.pdfSavePath} 文件夹\n")

printSpliter()
print(f"{pdfSavePath} 目录检查完成,开始执行主程序\n")
print(f"{projConfig.pdfSavePath} 目录检查完成,开始执行主程序\n")

sleep(feedbacktime)

Expand All @@ -112,7 +129,7 @@ def printSpliter(length=25):

printSpliter()

spiderpub(encoded_param, args.page_num, result_num)
spiderpub(encoded_param, args.pagenum, result_num)

printSpliter()
print("\n\n爬取搜索结果完成,开始执行单篇检索,耗时更久\n\n")
Expand All @@ -123,11 +140,11 @@ def printSpliter(length=25):
print("\n\n爬取搜索结果完成,开始执行文献下载,耗时更久\n\n")

# PDFHelper.PDFBatchDonwload(args.download_num)
PDFHelper.PDFBatchDownloadEntry(args.download_num)
PDFHelper.PDFBatchDownloadEntry(args.downloadnum)

ExcelHelper.PD_To_excel(dbpath, override=True)
print("爬取最终结果信息已经自动保存到excel表格中,文件名为%s" % ExcelHelper.tablename)
print(f"爬取的所有文献已经保存到{pdfSavePath}目录下")
print(f"爬取的所有文献已经保存到{projConfig.pdfSavePath}目录下")
print("爬取程序已经执行完成,自动退出, 哈哈,no errors no warning")

printSpliter()
Expand Down
Loading