
Commit 9f1c772

Merge pull request #125 from pepkit/dev
Release 0.12.5
2 parents: 109b10e + 133bcb7 · commit 9f1c772

9 files changed: +99 additions, -55 deletions

docs/changelog.md

Lines changed: 3 additions & 0 deletions
@@ -1,5 +1,8 @@
 # Changelog
 
+## [0.12.5] -- 2023-11-29
+- Fixed bug, where description was not populated in PEP
+
 ## [0.12.4] -- 2023-08-01
 - Fixed SRA convert
 - Added how to convert SRA

geofetch/__init__.py

Lines changed: 4 additions & 3 deletions
@@ -1,13 +1,14 @@
 """ Package-level data """
 import logmuse
+import coloredlogs
 
-from geofetch.geofetch import *
-from geofetch.finder import *
+from geofetch.geofetch import Geofetcher
+from geofetch.finder import Finder
 from geofetch._version import __version__
 
 
 __author__ = ["Oleksandr Khoroshevskyi", "Vince Reuter", "Nathan Sheffield"]
-__all__ = ["Finder", "Geofetcher"]
+__all__ = ["Finder", "Geofetcher", "__version__"]
 
 _LOGGER = logmuse.init_logger("geofetch")
 coloredlogs.install(
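The star imports above are replaced by explicit names, and __version__ is added to __all__, so the package's public surface is declared in one place. A minimal sketch of the effect, assuming geofetch 0.12.5 is installed (names and values follow from the diff above):

    import geofetch
    from geofetch import Finder, Geofetcher, __version__

    # __all__ now lists the public API explicitly, so `from geofetch import *`
    # injects exactly these names rather than everything the submodules define.
    print(geofetch.__all__)  # ['Finder', 'Geofetcher', '__version__']
    print(__version__)       # '0.12.5' for this release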

geofetch/_version.py

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-__version__ = "0.12.4"
+__version__ = "0.12.5"

geofetch/finder.py

Lines changed: 1 addition & 1 deletion
@@ -127,7 +127,7 @@ def _run_search_query(url: str) -> list:
     """
     x = requests.get(url)
     if x.status_code != 200:
-        _LOGGER.error(f"Request status != 200. Error. Check your request")
+        _LOGGER.error("Request status != 200. Error. Check your request")
         return []
     try:
         x_result = xmltodict.parse(x.text)["eSearchResult"]
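Several hunks in this commit simply drop the f prefix from strings that contain no placeholders. A small illustration of the distinction, using plain Python string behavior (the status variable is hypothetical):

    status = 404  # hypothetical value

    plain = "Request status != 200. Error. Check your request"  # no placeholder, no f prefix needed
    formatted = f"Request returned status {status}"              # placeholder, f-string justified
    print(plain)
    print(formatted)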

geofetch/geofetch.py

Lines changed: 53 additions & 28 deletions
@@ -17,7 +17,24 @@
 import pandas as pd
 
 from geofetch.cli import _parse_cmdl
-from geofetch.const import *
+from geofetch.const import (
+    GSE_PATTERN,
+    SAMPLE_SUPP_METADATA_FILE,
+    EXP_SUPP_METADATA_FILE,
+    NEW_GENOME_COL_NAME,
+    FILE_RAW_NAME_SAMPLE_PATTERN,
+    FILE_RAW_NAME_SUBSAMPLE_PATTERN,
+    CONFIG_RAW_TEMPLATE_NAME,
+    CONFIG_SRA_TEMPLATE,
+    CONFIG_PROCESSED_TEMPLATE_NAME,
+    NUM_RETRIES,
+    SER_SUPP_FILE_PATTERN,
+    SUPP_FILE_PATTERN,
+    PROJECT_PATTERN,
+    NCBI_EFETCH,
+    NCBI_ESEARCH,
+    EXPERIMENT_PATTERN,
+)
 from geofetch.utils import (
     Accession,
     build_prefetch_command,

@@ -480,8 +497,8 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Project]:
                     file_gse_content, gsm_metadata, file_sra
                 )
                 if not srp_list_result:
-                    _LOGGER.info(f"No SRP data, continuing ....")
-                    _LOGGER.warning(f"No raw pep will be created! ....")
+                    _LOGGER.info("No SRP data, continuing ....")
+                    _LOGGER.warning("No raw pep will be created! ....")
                     # delete current acc if no raw data was found
                     # del metadata_dict[acc_GSE]
                     pass

@@ -498,7 +515,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Project]:
                         _LOGGER.info(f"Getting SRR: {run} in ({acc_GSE})")
                         self._download_raw_data(run)
                 else:
-                    _LOGGER.info(f"Dry run, no data will be downloaded")
+                    _LOGGER.info("Dry run, no data will be downloaded")
 
             # save one project
             if self.acc_anno and nkeys > 1:

@@ -517,7 +534,7 @@ def fetch_all(self, input: str, name: str = None) -> Union[NoReturn, peppy.Project]:
 
         # Logging cleaning process:
         if self.discard_soft:
-            _LOGGER.info(f"Cleaning soft files ...")
+            _LOGGER.info("Cleaning soft files ...")
             clean_soft_files(self.metadata_root_full)
 
     #######################################################################################

@@ -878,7 +895,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str):
         if element_is_list:
             for n_elem in range(len(metadata_list)):
                 try:
-                    if type(metadata_list[n_elem][dict_key]) is not list:
+                    if not isinstance(metadata_list[n_elem][dict_key], list):
                         metadata_list[n_elem][dict_key] = [
                             metadata_list[n_elem][dict_key]
                         ]
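The type(...) is not list tests are rewritten with isinstance, which is the idiomatic check and also accepts list subclasses. A standalone sketch of the wrapping pattern used in this hunk (the dictionary is hypothetical):

    record = {"genome": "hg38"}  # hypothetical sample metadata entry

    # Wrap a scalar value in a list, but leave values that are already lists untouched.
    if not isinstance(record["genome"], list):
        record["genome"] = [record["genome"]]

    print(record)  # {'genome': ['hg38']}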
@@ -930,7 +947,7 @@ def _expand_metadata_list_item(self, metadata_list: list, dict_key: str):
                         metadata_list[n_elem][dict_key] = this_string
                     else:
                         del metadata_list[n_elem][dict_key]
-                except KeyError as err:
+                except KeyError:
                     # _LOGGER.warning(
                     #     f"expand_metadata_list: Key Error: {err}, continuing ..."
                     # )

@@ -980,6 +997,7 @@ def _write_processed_annotation(
     ) -> Union[NoReturn, peppy.Project]:
         """
         Save annotation file by providing list of dictionaries with files metadata
+
         :param list processed_metadata: list of dictionaries with files metadata
         :param str file_annotation_path: the path to the metadata file that has to be saved
         :param just_object: True, if you want to get peppy object without saving file

@@ -1046,13 +1064,14 @@ def _write_processed_annotation(
             proj = peppy.Project().from_pandas(pd_value, config=conf)
             proj_exp_data = conf.get("experiment_metadata")
             if proj_exp_data:
-                proj["description"] = proj_exp_data.get("series_title")
+                proj.description = proj_exp_data.get("series_title")
             return proj
 
     @staticmethod
     def _find_genome(metadata_list: list) -> list:
         """
         Create new genome column by searching joining few columns
+
         :param metadata_list: list with metadata dict
         :return: list with metadata dict where genome column was added
         """
@@ -1080,6 +1099,7 @@ def _write_raw_annotation_new(
         """
         Combine individual accessions into project-level annotations, and writing
         individual accession files (if requested)
+
         :param name: Name of the run, project, or acc --> will influence name of the folder where project will be created
         :param metadata_dict: dictionary of sample annotations
         :param subannot_dict: dictionary of subsample annotations

@@ -1128,7 +1148,7 @@ def _write_raw_annotation_new(
                 f"subsample_table: {os.path.basename(proj_root_subsample)}"
             )
         else:
-            subanot_path_yaml = f""
+            subanot_path_yaml = ""
 
         template = self._create_config_raw(
             proj_meta, proj_root_sample, subanot_path_yaml, gse_meta_dict

@@ -1166,7 +1186,7 @@ def _write_raw_annotation_new(
             proj = peppy.Project().from_pandas(meta_df, sub_meta_df, conf)
             proj_exp_data = conf.get("experiment_metadata")
             if proj_exp_data:
-                proj["description"] = proj_exp_data.get("series_title")
+                proj.description = proj_exp_data.get("series_title")
             return proj
 
     def _create_config_processed(

@@ -1177,6 +1197,7 @@ def _create_config_processed(
     ) -> str:
         """
         Compose and generate config file content
+
         :param file_annotation_path: root to the annotation file
         :param proj_meta: common metadata that has to added to config file
         :param meta_in_series:

@@ -1218,6 +1239,7 @@ def _create_config_raw(
     ):
         """
         Compose and generate config file content for raw data
+
         :param proj_meta: root to the annotation file
         :param proj_root_sample: path to sampletable file
         :param subanot_path_yaml: path to subannotation file

@@ -1275,6 +1297,7 @@ def _check_sample_name_standard(metadata_dict: dict) -> dict:
         """
         Standardize sample name and checking if it exists
         (This function is used for raw data)
+
         :param metadata_dict: metadata dict
         :return: metadata dict with standardize sample names
         """

@@ -1300,14 +1323,16 @@ def _separate_common_meta(
     ) -> tuple:
         """
         Separate experiment(project) metadata from sample metadata
+
         :param list or dict meta_list: list of dictionaries of samples
         :param int max_len: threshold of the length of the common value that can be stored in the sample table
         :param int del_limit: threshold of the length of the common value that have to be deleted
         :param int attr_limit_truncate: max length of the attribute in the sample csv
         :return set: Return is a set of list, where 1 list (or dict) is
-        list of samples metadata dictionaries and 2: list of common samples metadata
-        dictionaries that are linked to the project.
+            list of samples metadata dictionaries and 2: list of common samples metadata
+            dictionaries that are linked to the project.
         """
+
         # check if meta_list is dict and converting it to list
         input_is_dict = False
         if isinstance(meta_list, dict):

@@ -1401,6 +1426,7 @@ def _download_SRA_file(self, run_name: str):
     def _sra_to_bam_conversion_sam_dump(self, bam_file: str, run_name: str) -> NoReturn:
         """
         Convert SRA file to BAM file by using samtools function "sam-dump"
+
         :param str bam_file: path to BAM file that has to be created
         :param str run_name: SRR number of the SRA file that has to be converted
         """

@@ -1509,7 +1535,7 @@ def _download_file(
         full_filepath = os.path.join(data_folder, new_name)
 
         if not os.path.exists(full_filepath):
-            _LOGGER.info(f"\033[38;5;242m")  # set color to gray
+            _LOGGER.info("\033[38;5;242m")  # set color to gray
             # if dir does not exist:
             if not os.path.exists(data_folder):
                 os.makedirs(data_folder)

@@ -1518,7 +1544,7 @@ def _download_file(
             )
             _LOGGER.info(f"\033[38;5;242m{ret}\033[0m")
             time.sleep(sleep_after)
-            _LOGGER.info(f"\033[0m")  # Reset to default terminal color
+            _LOGGER.info("\033[0m")  # Reset to default terminal color
         else:
             _LOGGER.info(f"\033[38;5;242mFile {full_filepath} exists.\033[0m")
 

@@ -1545,7 +1571,7 @@ def _get_list_of_processed_files(
                         pl = parse_SOFT_line(line)
                         file_url = pl[list(pl.keys())[0]].rstrip()
                         filename = os.path.basename(file_url)
-                        _LOGGER.debug(f"Processed GSE file found: %s" % str(file_url))
+                        _LOGGER.debug(f"Processed GSE file found: {str(file_url)}")
 
                         # search for tar file:
                         if tar_re.search(filename):

@@ -1574,7 +1600,7 @@ def _get_list_of_processed_files(
                 )
 
             else:
-                raise Exception(f"error in requesting tar_files_list")
+                raise Exception("error in requesting tar_files_list")
         else:
             _LOGGER.info(f"Found previous GSM file: {filelist_path}")
             filelist_obj = open(filelist_path, "r")

@@ -1610,9 +1636,8 @@ def _get_list_of_processed_files(
                         ):
                             meta_processed_samples[nb].update(pl)
                         else:
-                            if (
-                                type(meta_processed_samples[nb][element_keys])
-                                is not list
+                            if not isinstance(
+                                meta_processed_samples[nb][element_keys], list
                             ):
                                 meta_processed_samples[nb][element_keys] = [
                                     meta_processed_samples[nb][element_keys]

@@ -1631,7 +1656,7 @@ def _get_list_of_processed_files(
                         pl = parse_SOFT_line(line_gsm)
                         file_url_gsm = pl[list(pl.keys())[0]].rstrip()
                         _LOGGER.debug(
-                            f"Processed GSM file found: %s" % str(file_url_gsm)
+                            f"Processed GSM file found: {str(file_url_gsm)}"
                         )
                         if file_url_gsm != "NONE":
                             meta_processed_samples[nb]["files"].append(file_url_gsm)

@@ -1643,8 +1668,7 @@ def _get_list_of_processed_files(
         meta_processed_samples = _separate_file_url(meta_processed_samples)
 
         _LOGGER.info(
-            f"\nTotal number of processed SAMPLES files found is: "
-            f"%s" % str(len(meta_processed_samples))
+            f"\nTotal number of processed SAMPLES files found is: {str(len(meta_processed_samples))}"
         )
 
         # expand meta_processed_samples with information about type and size

@@ -1677,21 +1701,21 @@ def _get_list_of_processed_files(
                    if bl_key not in meta_processed_series.keys():
                        meta_processed_series.update(bl)
                    else:
-                        if type(meta_processed_series[bl_key]) is not list:
+                        if not isinstance(meta_processed_series[bl_key], list):
                            meta_processed_series[bl_key] = [meta_processed_series[bl_key]]
                            meta_processed_series[bl_key].append(bl_value)
                        else:
                            meta_processed_series[bl_key].append(bl_value)
            except IndexError as ind_err:
                _LOGGER.debug(
-                    f"IndexError in adding value to meta_processed_series: %s" % ind_err
+                    f"IndexError in adding value to meta_processed_series: {ind_err}"
                )
 
        meta_processed_series = _separate_list_of_files(meta_processed_series)
        meta_processed_series = _separate_file_url(meta_processed_series)
        _LOGGER.info(
            f"Total number of processed SERIES files found is: "
-            f"%s" % str(len(meta_processed_series))
+            f"{str(len(meta_processed_series))}"
        )
        if self.filter_re:
            meta_processed_series = self._run_filter(meta_processed_series)

@@ -1778,6 +1802,7 @@ def _download_processed_file(self, file_url: str, data_folder: str) -> bool:
     def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None):
         """
         Parse out the SRA project identifier from the GSE file
+
         :param list file_gse_content: list of content of file_sde_content
         :param dict gsm_metadata: dict of GSM metadata
         :param str file_sra: full path to SRA.csv metafile that has to be downloaded

@@ -1805,7 +1830,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None):
                 acc_SRP = list(gsm_metadata.keys())[0]
                 _LOGGER.warning(
                     "But the GSM has an SRX number; instead of an "
-                    "SRP, using SRX identifier for this sample: " + acc_SRP
+                    f"SRP, using SRX identifier for this sample: {acc_SRP}"
                 )
             except TypeError:
                 _LOGGER.warning("Error in gsm_metadata")

@@ -1839,7 +1864,7 @@ def _get_SRA_meta(self, file_gse_content: list, gsm_metadata, file_sra=None):
                 return []
         else:
             # open existing annotation
-            _LOGGER.info(f"Found SRA metadata, opening..")
+            _LOGGER.info("Found SRA metadata, opening..")
             with open(file_sra, "r") as m_file:
                 reader = csv.reader(m_file)
                 file_list = []

@@ -1869,7 +1894,7 @@ def _get_SRP_list(self, srp_number: str) -> list:
         :return: list of dicts of SRRs
         """
         if not srp_number:
-            _LOGGER.info(f"No srp number in this accession found")
+            _LOGGER.info("No srp number in this accession found")
             return []
         _LOGGER.info(f"Downloading {srp_number} sra metadata")
         ncbi_esearch = NCBI_ESEARCH.format(SRP_NUMBER=srp_number)
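The remaining logging changes in this file replace %-style formatting and string concatenation inside log calls with straight f-string interpolation. The three forms below produce identical strings; the last is what the commit standardizes on (the value is hypothetical):

    file_url = "GSE123_counts.tar"  # hypothetical

    old_percent = "Processed GSE file found: %s" % str(file_url)
    old_concat = "Processed GSE file found: " + str(file_url)
    new_fstring = f"Processed GSE file found: {file_url}"

    assert old_percent == old_concat == new_fstring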

geofetch/sraconvert.py

Lines changed: 5 additions & 3 deletions
@@ -144,12 +144,14 @@ def main():
             # for paired-end data, and only *_1.fastq for single-end data.
             outfile = "{fq_prefix}_1.fastq.gz".format(fq_prefix=fq_prefix)
             cmd = "fasterq-dump {data_source} -O {outfolder}".format(
-                data_source=infile, outfolder=args.fqfolder, nofail=True
+                data_source=infile,
+                outfolder=args.fqfolder,
             )
         elif args.format == "bam":
             outfile = os.path.join(args.bamfolder, args.srr[i] + ".bam")
             cmd = "sam-dump -u {data_source} | samtools view -bS - > {outfile}".format(
-                data_source=infile, outfile=outfile, nofail=True
+                data_source=infile,
+                outfile=outfile,
             )
         else:
             raise KeyError("Unknown format: {}".format(args.format))

@@ -160,7 +162,7 @@ def main():
             pm.info("Already completed files: {}".format(failed_files))
             try:
                 failed_files.remove(infile)
-            except:
+            except Exception:
                 pass
 
     elif args.mode == "delete_bam":
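Two small cleanups here: the str.format() calls no longer pass a stray nofail=True keyword (str.format silently ignores keyword arguments the template never references, so it was dead weight), and the bare except is narrowed to except Exception. A brief sketch of the second point (list contents are hypothetical):

    failed_files = ["SRR000001.sra"]  # hypothetical
    infile = "SRR000002.sra"

    try:
        failed_files.remove(infile)  # raises ValueError when infile is absent
    except Exception:
        # Still swallows the ValueError, but unlike a bare `except:` it lets
        # KeyboardInterrupt and SystemExit propagate.
        pass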
