13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
class PubMedInterface:
    """Search PubMed and fetch article metadata, retrying on HTTP errors.

    All network access goes through the file-level ``Entrez`` and ``Medline``
    names (presumably Biopython's ``Bio.Entrez`` / ``Bio.Medline`` -- confirm
    against the import block). Every request is attempted once and then
    retried up to ``max_retries`` more times when an ``HTTPError`` is raised,
    pausing ``delay_seconds`` between attempts.
    """

    def __init__(
        self,
        email="rmelvin@uabmc.edu",
        max_results=50,
        streamlit_context=False,
        max_retries=3,
        delay_seconds=5,
    ):
        """Store the request settings and register the contact email.

        Args:
            email: Contact address assigned to ``Entrez.email``.
            max_results: Maximum number of IDs requested per search.
            streamlit_context: When true, progress/failure messages are also
                sent to the file-level ``st`` object (``st.warning`` /
                ``st.error``) in addition to being printed.
            max_retries: Number of retries after the first failed attempt.
            delay_seconds: Seconds to sleep between attempts.
        """
        self.email = email
        self.max_results = max_results
        self.streamlit_context = streamlit_context
        self.max_retries = max_retries
        self.delay_seconds = delay_seconds
        # NOTE: this is an attribute of the shared Entrez object, so the most
        # recently constructed instance decides the email for all of them.
        Entrez.email = email

    def _report(self, message, final=False):
        """Print ``message`` and mirror it to Streamlit when enabled."""
        print(message)
        if self.streamlit_context:
            if final:
                st.error(message)
            else:
                st.warning(message)

    def _run_with_retries(self, request, failure_result):
        """Call ``request()`` until it succeeds or the retry budget is spent.

        Args:
            request: Zero-argument callable performing one PubMed request.
            failure_result: Value returned when every attempt raised
                ``HTTPError``.

        Returns:
            Whatever ``request()`` returns, or ``failure_result``.
        """
        # One initial attempt plus ``max_retries`` retries; never fewer than
        # one attempt, even for max_retries <= 0.
        total_attempts = max(self.max_retries, 0) + 1
        for attempt in range(1, total_attempts + 1):
            try:
                return request()
            except HTTPError as e:
                self._report(
                    f"PubMed didn't respond (attempt {attempt}/{total_attempts}): {e}"
                )
                if attempt < total_attempts:
                    self._report(
                        f"Waiting {self.delay_seconds} seconds before trying PubMed again..."
                    )
                    time.sleep(self.delay_seconds)
                else:
                    self._report(
                        "Giving up on PubMed. It was an issue on their end. You may want to try again later.",
                        final=True,
                    )
        return failure_result

    @staticmethod
    def _format_author(author):
        """Convert one ``"Last Name Initials"`` string to ``"Last Name, Initials."``.

        A name without any space (nothing to split into last name and
        initials) is returned unchanged instead of producing ``", Name."``.
        """
        last_name, separator, initials = author.rpartition(" ")
        if not separator:
            return author
        return f"{last_name}, {initials}."

    def _format_authors(self):
        """Format ``self._authors`` as a comma-separated author list.

        Up to 20 authors are all listed. For more than 20, the first 19 are
        listed, followed by an ellipsis and the final author.

        Returns:
            The formatted author string; empty when there are no authors.
        """
        authors = self._authors
        if len(authors) <= 20:
            return ", ".join(self._format_author(author) for author in authors)

        formatted_authors = [self._format_author(author) for author in authors[:19]]
        formatted_authors.append("…")
        formatted_authors.append(self._format_author(authors[-1]))
        return ", ".join(formatted_authors)

    def _format_apa_citation(self):
        """Build a citation string from the most recently extracted record.

        Reads the ``_pub_month``, ``_title``, ``_journal``, ``_volume``,
        ``_pages`` and ``_pmid`` attributes set by ``_extract_record_data``.

        Returns:
            ``"<authors> (<date>). <title> <journal>, <volume>, <pages>. PMID: <pmid>"``.
        """
        authors = self._format_authors()
        return f"{authors} ({self._pub_month}). {self._title} {self._journal}, {self._volume}, {self._pages}. PMID: {self._pmid}"

    def _extract_record_data(self, record):
        """Copy the fields of one Medline-style record onto this instance.

        Args:
            record: Mapping keyed by two-to-four letter field tags (``TI``,
                ``OT``, ``MH``, ``AB``, ``PMID``, ``DP``, ``AU``, ``JT``,
                ``VI``, ``PG``). Missing tags fall back to a placeholder.
        """
        self._title = record.get("TI", "No title available")
        # OT is not present in every record; fall back to the MH headings.
        self._keywords = record.get("OT", []) or record.get("MH", [])
        self._abstract = record.get("AB", "No abstract available")
        self._pmid = record.get("PMID", "No PMID available")
        self._pub_month = record.get("DP", "No date available")
        self._authors = record.get("AU", [])
        self._journal = record.get("JT", "No journal name available")
        self._volume = record.get("VI", "No volume available")
        self._pages = record.get("PG", "No pages available")

    def search_pubmed_articles(self, query):
        """Search PubMed and return the matching article IDs.

        Uses ``self.max_results`` as the result cap and the instance's retry
        settings.

        Args:
            query: The search query string for PubMed.

        Returns:
            The ``"IdList"`` entry of the search result, or ``[]`` when every
            attempt failed with an ``HTTPError``.
        """

        def request():
            handle = Entrez.esearch(
                db="pubmed", term=query, sort="relevance", retmax=self.max_results
            )
            try:
                record = Entrez.read(handle)
            finally:
                # Release the connection even when parsing fails.
                handle.close()
            return record["IdList"]

        return self._run_with_retries(request, [])

    def fetch_article_details(self, pubmed_ids):
        """Fetch Medline records for ``pubmed_ids`` and tabulate them.

        Args:
            pubmed_ids: Iterable of PubMed IDs; each is converted with ``str``.

        Returns:
            A ``pandas.DataFrame`` with one row per record and the columns
            ``date_published``, ``title``, ``keywords``, ``abstract``,
            ``pmid``, ``authors``, ``journal`` and ``citation``; or ``None``
            when every attempt failed with an ``HTTPError``.
        """
        ids_string = ",".join(str(pubmed_id) for pubmed_id in pubmed_ids)

        def request():
            handle = Entrez.efetch(
                db="pubmed", id=ids_string, rettype="medline", retmode="text"
            )
            try:
                # Medline.parse is consumed here so that all reading from the
                # handle happens before it is closed (and inside the retry).
                return list(Medline.parse(handle))
            finally:
                handle.close()

        records = self._run_with_retries(request, None)
        if records is None:
            return None

        parsed_data = []
        for record in records:
            self._extract_record_data(record)
            citation = self._format_apa_citation()
            parsed_data.append(
                {
                    "date_published": self._pub_month,
                    "title": self._title,
                    "keywords": self._keywords,
                    "abstract": self._abstract,
                    "pmid": self._pmid,
                    "authors": self._authors,
                    "journal": self._journal,
                    "citation": citation,
                }
            )
        return pd.DataFrame(parsed_data)

    def fetch_article_details_xml(self, pubmed_ids):
        """Fetch the XML representation of the given PubMed articles.

        Args:
            pubmed_ids: Iterable of PubMed IDs; each is converted with ``str``.

        Returns:
            The ``"PubmedArticle"`` entry of the parsed response, or ``[]``
            when every attempt failed with an ``HTTPError``.
        """
        ids_string = ",".join(str(pubmed_id) for pubmed_id in pubmed_ids)

        def request():
            handle = Entrez.efetch(db="pubmed", id=ids_string, retmode="xml")
            try:
                return Entrez.read(handle)["PubmedArticle"]
            finally:
                handle.close()

        return self._run_with_retries(request, [])
|