Skip to content

Commit

Permalink
Fixed conflict with read_file definition.
Browse files Browse the repository at this point in the history
  • Loading branch information
miballe committed Oct 12, 2024
2 parents c465ad4 + 61b562d commit cbb3ce8
Showing 1 changed file with 19 additions and 12 deletions.
31 changes: 19 additions & 12 deletions src/factiva/analytics/integration/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,18 +7,21 @@
class SnapshotFiles(object):


def read_avro_file(self, filepath, only_stats=False, merge_body=False) -> pd.DataFrame:
def read_file(self, filepath, stats_only=False, merge_body=False, all_fields=False) -> pd.DataFrame:
"""Reads a single Dow Jones snapshot datafile
Parameters
----------
filepath : str
Relative or absolute file path
only_stats : bool, optional
stats_only : bool, optional
Specifies if only file metadata is loaded (True), or if the full article content is loaded (False). On average,
stats_only loads about 1/10 and is recommended for quick metadata-based analysis. (Default is False)
merge_body : bool, optional
Specifies if the snippet should be merged into the body field, with the snippet column then being dropped.
(default is False)
all_fields : bool, optional
If `True`, all fields are loaded into the Pandas DataFrame and the parameters `stats_only` and
`merge_body` are ignored. (Default is False)
Returns
-------
pandas.DataFrame
Expand All @@ -29,19 +32,23 @@ def read_avro_file(self, filepath, only_stats=False, merge_body=False) -> pd.Dat
records = [r for r in reader]
r_df = pd.DataFrame.from_records(records)

if only_stats is True:
r_df = r_df[const.SNAPSHOT_FILE_STATS_FIELDS]
if all_fields is False:
if stats_only is True:
r_df = r_df[const.SNAPSHOT_FILE_STATS_FIELDS]

if (only_stats is False) & (merge_body is True):
r_df['body'] = r_df['snippet'] + '\n\n' + r_df['body']
r_df.drop('snippet', axis=1, inplace=True)
if (stats_only is False) & (merge_body is True):
r_df['body'] = r_df['snippet'] + '\n\n' + r_df['body']
r_df.drop('snippet', axis=1, inplace=True)

if only_stats is False:
r_df['body'] = r_df[['body']].apply(lambda x: '{}'.format(x[0]), axis=1)
if stats_only is False:
r_df['body'] = r_df[['body']].apply(lambda x: '{}'.format(x[0]), axis=1)

for d_field in const.SNAPSHOT_FILE_DELETE_FIELDS:
if d_field in r_df.columns:
r_df.drop(d_field, axis=1, inplace=True)
for d_field in const.SNAPSHOT_FILE_DELETE_FIELDS:
if d_field in r_df.columns:
r_df.drop(d_field, axis=1, inplace=True)
else:
# TODO: Support merge_body for when all_fields is True
r_df['body'] = r_df[['body']].apply(lambda x: '{}'.format(x[0]), axis=1)

r_df['publication_datetime'] = r_df['publication_datetime'].astype('datetime64[ms]')
r_df['modification_datetime'] = r_df['modification_datetime'].astype('datetime64[ms]')
Expand Down

0 comments on commit cbb3ce8

Please sign in to comment.