Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[FIX] Pubmed - replace deprecated extend_corpus #949

Merged
merged 2 commits into from
Feb 24, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 5 additions & 21 deletions orangecontrib/text/pubmed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,10 @@
from datetime import datetime

import numpy as np
from Bio import Entrez
from Bio import Medline
from Bio import Entrez, Medline
from Orange.data import DiscreteVariable, Domain, StringVariable, TimeVariable
from Orange.misc import environ


from Orange.data import StringVariable, DiscreteVariable, TimeVariable, Domain
from orangecontrib.text.corpus import Corpus

BASE_ENTRY_URL = 'http://www.ncbi.nlm.nih.gov/pubmed/?term='
Expand Down Expand Up @@ -185,7 +183,6 @@ def __init__(self, email, progress_callback=None, error_callback=None):
self.error_callback = error_callback
self.stop_signal = False

self.cache_path = None
cache_folder = os.path.join(environ.cache_dir(), 'pubmedcache')

if not os.path.exists(cache_folder):
Expand Down Expand Up @@ -323,7 +320,6 @@ def _retrieve_records(self, num_records,
`orangecontrib.text.corpus.Corpus`: The retrieved PubMed records
as a corpus.
"""
corpus = None
batch_size = min(self.MAX_BATCH_SIZE, num_records)
cached_data = [] # Later on, construct the corpus from this.
new_records = [] # Must download.
Expand Down Expand Up @@ -353,10 +349,8 @@ def _retrieve_records(self, num_records,
# Advance the callback accordingly.
self.progress_callback(int(cached_data_size/batch_size))

# Create a starting corpus.
corpus = _corpus_from_records(cached_data, includes_metadata)

# --- Retrieve missing/new ---
records = []
if len(new_records) > 0:
try:
post_handle = Entrez.epost('pubmed', id=','.join(new_records))
Expand Down Expand Up @@ -403,18 +397,8 @@ def _retrieve_records(self, num_records,
if self.progress_callback:
self.progress_callback()

if corpus is None:
corpus = _corpus_from_records(records, includes_metadata)
else: # Update the corpus.
time_var = corpus.domain[PUBMED_FIELD_DATE]
meta_values, class_values = _records_to_corpus_entries(
records,
includes_metadata=includes_metadata,
time_var=time_var,
)

corpus.extend_corpus(meta_values, class_values)

data = cached_data + records
corpus = _corpus_from_records(data, includes_metadata) if len(data) else None
return corpus

def download_records(self, terms=[], authors=[],
Expand Down
4 changes: 4 additions & 0 deletions orangecontrib/text/tests/data/pubmed-cache.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"read": {"RetMax": "533", "TranslationStack": [{"Term": "orchid[All fields]", "Field": "All fields", "Explode": "N", "Count": "1410"}, {"Term": "2011/11/02[PDAT]", "Field": "PDAT", "Explode": "N", "Count": "0"}, {"Term": "2015/11/02[PDAT]", "Field": "PDAT", "Explode": "N", "Count": "0"}, "RANGE", "AND"], "IdList": ["26779201", "26713612", "26666122", "26603277", "26602351"], "RetStart": "0", "WebEnv": "NCID_1_13312974_165.112.9.37_9001_1459677505_1646131365_0MetA0_S_MegaStore_F_1", "TranslationSet": [], "QueryTranslation": "orchid[All fields] AND 2011/11/02[PDAT] : 2015/11/02[PDAT]", "Count": "533", "QueryKey": "1"},
"parse": [{"PMID": "26779201", "FAU": ["Li, Jia-Wei", "Zhang, Shi-Bao"], "TI": "Differences in the Responses of Photosystems I and II in Cymbidium sinense and C. tracyanum to Long-Term Chilling Stress.", "MH": null, "AB": "The susceptibility of photosystem I (PSI) and photosystem II (PSII) to chilling stress depends on plant species, and cyclic electron flow (CEF) plays an important role in photoprotection for some species under short stress periods. However, little is known about the responses of PSI and PSII to long-term chilling stress. We studied two orchid species-Cymbidium sinense and C. tracyanum- that differ in their capacity to adapt to low temperature, and exposed plants for 19 d to stress conditions that included 4 degrees C and a light intensity of 250 to 350 mumol photons m(-2) s(-1). Meanwhile, we investigated their dynamic variations in Chl fluorescence and P700 parameters. After exposure to 4 degrees C and 250 mumol photons m(-2) s(-1) for 6 h, PSI activity was maintained stable in both species, but stronger PSII photoinhibition was observed in C. sinense. During the long-term treatment, the maximum quantum yield of PSII was significantly reduced, with that decrease being greater in C. sinense. After 19 d of chilling treatment, the maximum photo-oxidizable P700 declined only slightly in C. tracyanum but dropped significantly in C. sinense. Linear electron flow was largely depressed during the long-term chilling treatment, especially in C. sinense. Meanwhile, C. tracyanum showed higher CEF activity than C. sinense. These results indicate that PSII is more sensitive to chilling-light stress than PSI in both species. The rate of PSII photodamage at chilling-light stress is higher in C. sinense than C. tracyanum, and CEF contributes to photoprotection for PSI and PSII under long-term chilling stress in C. tracyanum.", "DP": "2015"}, {"PMID": "26666122", "FAU": ["Dos Santos, Aline Borba", "do Nascimento, Fabio Santos"], "TI": "Cuticular Hydrocarbons of Orchid Bees Males: Interspecific and Chemotaxonomy Variation.", "MH": null, "AB": "Recent studies have investigated the composition of compounds that cover the cuticle in social insects, but few studies have focused on solitary bees. Cuticular hydrocarbons may provide a tool for chemotaxonomy, and perhaps they can be used as a complement to morphology and genetic characters in phylogenetic studies. Orchid bees (Tribe Euglossini) are a highly diverse group of Neotropical bees with more than 200 species. Here, the cuticular hydrocarbons of 17 species were identified and statistical analysis revealed 108 compounds, which allowed for the taxonomic classification according to the genera. The most significant compounds discriminating the four genera were (Z)-9-pentacosene, (Z,Z)-pentatriacontene-3, (Z)-9-tricosene, and (Z)-9-heptacosene. The analyses demonstrated the potential use of CHCs to identify different species.", "DP": "2015"}, {"PMID": "26713612", "FAU": ["Ospina-Torres, Rodulfo", "Montoya-Pfeiffer, Paula Maria", "Parra-H, Alejandro", "Solarte, Victor", "Tupac Otero, Joel"], "TI": "Interaction networks and the use of floral resources by male orchid bees (Hymenoptera: Apidae: Euglossini) in a primary rain forests of the Choco Region (Colombia).", "MH": ["Animals", "Bees/classification/*physiology", "Colombia", "*Ecosystem", "Male", "Orchidaceae/*classification", "*Pollination", "Population Density", "Rainforest"], "AB": "Orchid bees are important keystone pollinators from the Neotropics. With the aim to study the relationships between orchid bees and their nectar and aromatic host species, we made systematic samplings of males across two conservation areas in the biogeographic Choc6 Region of Colombia. We used chemical baits to collect 352 male bees during five months. The pollen attached to their bodies was extracted for palynological identification and to estimate interaction networks. The euglossine community consisted of at least 22 species including Eg. maculilabris, Eg. orellana, Eg. championi and Eg. ignita. The male bees were associated with 84 plants but depended on a small group of them (Peperomia spp. and Anthurium spp, as well as species of Solanaceae, Ericaceae and Malpighiaceae) which were widely distributed across the altitudinal gradient, and were available through the year. The resulting interaction networks revealed a typical nested pattern usually found in plant-pollinator interactions, with several rare bee and plant species interaction with a small group of generalist bees and plant species. Albeit, we found variation within networks related to species composition. Such variation may be a consequence of specific differences in plant flowering phenology.", "DP": "2015 Sep"}, {"PMID": "26603277", "FAU": ["Nielsen, Lasse Janniche", "Moller, Birger Lindberg"], "TI": "Scent emission profiles from Darwin's orchid--Angraecum sesquipedale: Investigation of the aldoxime metabolism using clustering analysis.", "MH": null, "AB": "The display of scent is crucial for plants in attracting pollinating insects to flowers and ensuring successful pollination and reproduction. The large number of aldoxime volatile species present in the scent of the Madagascan orchid Angraecum sesquipedale has been suggested to play a primary role in attracting the sphingid moth Xanthopan morgani praedicta. By solid phase micro-extraction (SPME) coupled with gas chromatography-mass spectrometry (GC-MS), we monitored the scent release from different flowers of a single orchid, day and night throughout the entire flowering period. In separate experiments, the diurnal release was monitored in 3h intervals and the tissue specific release from the different floral parts was tracked. Numerous novel compounds related to the aldoxime metabolism not previously detected in A. sesquipedale were identified and positioned into a proposed pathway for aldoxime metabolism. From the results, we hypothesize that (E/Z)-phenylacetaldoxime and its derivatives could be important attractants for the pollinating moth X. morgani praedicta. By applying an untargeted Partitioning Around Medoids (PAM) cluster analysis to the metabolite profiles in the scent, the proposed pathways for the formation of aldoximes were substantiated. With this study, we demonstrate the powerful utility of a bioinformatics tool to aid in the elucidation of the routes of formation for volatiles and provide a benchmark and guidelines for future detailed observations of hawkmoth pollination of Angraecum species, and in particular A. sesquipedale, in the wild.", "DP": "2015 Dec"}, {"PMID": "26602351", "FAU": ["Oliveira, R", "Pinto, C E", "Schlindwein, C"], "TI": "Two common species dominate the species-rich Euglossine bee fauna of an Atlantic Rainforest remnant in Pernambuco, Brazil.", "MH": null, "AB": "Nowadays, the northern part of the Atlantic Rainforest of Brazil is largely destroyed and forest remnants rarely exceed 100 ha. In a 118 ha forest fragment within a state nature reserve of Pernambuco (Reserva Ecologica Gurjau), we surveyed the orchid bee fauna (Apidae, Euglossini) using eight different scent baits to attract males. Once a month during one year, the bees were actively collected with entomological nets, from November 2002 to October 2003 by two collectors. We collected 2,908 orchid bee males belonging to 23 species, one of the highest richness values of the Northern Atlantic Rainforest. Bees of only two species, Euglossa carolina (50%) and Eulaema nigrita (25%), which occurred throughout the year, accounted for three quarter of the collected individuals. Both species are typical for open or disturbed areas. Rainforest remnants like those of Gurjau within the predominant sugar cane monocultures in the coastal plains of the northern Atlantic Rainforest play an important role in orchid bee conservation and maintenance of biodiversity.", "DP": "2015 Nov"}]
}
Loading