Source code for pylyrics2.download_data

# Authors: Abhiket Gaurav, Artan Zandian, Macy Chan, Manju Abhinandana Kumar
# January 2022

import os
import pandas as pd
import kaggle


def download_data(dataset, file_path, columns):
    """
    Download a Kaggle dataset and return two of its columns as a dataframe.

    The dataset is downloaded and unzipped into ``file_path`` through the
    Kaggle API, one extracted file is read with ``pandas.read_csv`` and the
    result is reduced to the requested columns.

    Parameters
    ----------
    dataset : str
        Kaggle dataset name to download, e.g.
        ``"geomack/spotifyclassification"``.
    file_path : str
        Directory in which the downloaded files are saved.
    columns : list
        Names of the two columns to keep in the returned dataframe.

    Returns
    -------
    df : pandas.DataFrame
        A dataframe containing only the given columns, in the given order.

    Raises
    ------
    TypeError
        If ``dataset`` or ``file_path`` is not a string, if ``columns`` is
        not a list, or if ``columns`` does not hold exactly two names.
    ValueError
        If any requested column is missing from the downloaded data.
    FileNotFoundError
        If no file is present in ``file_path`` after the download.

    Example
    -------
    from pylyrics2 import download_data
    spotify_df = download_data("geomack/spotifyclassification",
                               "data/spotify_attributes",
                               ["song_title", "artist"])
    """
    try:
        if not isinstance(dataset, str):
            raise TypeError("Dataset should be of type string.")
        if not isinstance(file_path, str):
            raise TypeError("File_path should be of type string.")
        if not isinstance(columns, list):
            raise TypeError("The column names should be of type list")
        if len(columns) != 2:
            # TypeError (rather than ValueError) is kept so that existing
            # callers catching it keep working.
            raise TypeError("Two columns should be retrieved")

        # dirname() is "" for a bare folder name such as "spotify_attributes";
        # os.makedirs("") would raise, so only create a parent that exists
        # as a path component.
        parent_dir = os.path.dirname(file_path)
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            dataset,
            path=file_path,
            unzip=True,
        )

        df = pd.read_csv(_pick_data_file(file_path))

        if not set(columns).issubset(df.columns):
            raise ValueError("Incorrect column names, please check again")
        return df[columns]

    except (TypeError, ValueError) as req:
        # Echo the problem for interactive users, then let it propagate.
        print(req)
        raise


def _pick_data_file(folder):
    """Return the path of the file in ``folder`` to load, preferring CSVs."""
    # Sorted so the choice is deterministic; os.listdir order is arbitrary.
    file_names = sorted(
        name
        for name in os.listdir(folder)
        if os.path.isfile(os.path.join(folder, name))
    )
    csv_names = [name for name in file_names if name.lower().endswith(".csv")]

    # Fall back to any regular file so data files without a ".csv" suffix
    # are still handed to pandas, as before.
    candidates = csv_names or file_names
    if not candidates:
        raise FileNotFoundError(
            f"No data file found in {folder!r} after download"
        )
    return os.path.join(folder, candidates[-1])