# Source code for pylyrics2.download_data

# Authors: Abhiket Gaurav, Artan Zandian, Macy Chan, Manju Abhinandana Kumar
# January 2022

import os
import pandas as pd
import kaggle


def download_data(dataset: str, file_path: str, columns: list) -> pd.DataFrame:
    """
    Download a Kaggle dataset into a directory and load two of its columns.

    The dataset is downloaded and unzipped into ``file_path`` (treated as a
    directory), a CSV file found in that directory is read with pandas, and
    the resulting dataframe is restricted to ``columns``.

    Parameters
    ----------
    dataset: str
        kaggle dataset name to download
    file_path: str
        directory in which the downloaded files are saved
    columns: list
        list of exactly two column names to keep in the dataframe

    Returns
    -------
    df:
        A dataframe containing only the given columns, in the given order

    Raises
    ------
    TypeError
        If ``dataset`` or ``file_path`` is not a string, if ``columns`` is
        not a list, or if ``columns`` does not contain exactly two entries.
    ValueError
        If any of the requested columns is missing from the downloaded file.
    FileNotFoundError
        If no CSV file is present in ``file_path`` after the download.

    Notes
    -----
    ``TypeError`` and ``ValueError`` messages are printed before the
    exception is re-raised. If ``file_path`` contains several CSV files,
    the alphabetically last one is read.

    Example
    -------
    from pylyrics2 import download_data
    spotify_df = download_data("geomack/spotifyclassification", "data/spotify_attributes", ["song_title", "artist"])
    """
    try:
        # Validate every argument before touching the filesystem or network.
        if not isinstance(dataset, str):
            raise TypeError("Dataset should be of type string.")
        if not isinstance(file_path, str):
            raise TypeError("File_path should be of type string.")
        if not isinstance(columns, list):
            raise TypeError("The column names should be of type list")
        # Kept as TypeError (not ValueError) so callers that already catch
        # TypeError for a wrong column count keep working.
        if len(columns) != 2:
            raise TypeError("Two columns should be retrieved")

        # file_path itself is the download directory (it is listed below),
        # so create it directly; using its dirname would break for a
        # single-component path such as "data", whose dirname is "".
        os.makedirs(file_path, exist_ok=True)

        kaggle.api.authenticate()
        kaggle.api.dataset_download_files(
            dataset,
            path=file_path,
            unzip=True,
        )

        # os.listdir order is arbitrary, so pick the file deterministically
        # and ignore anything that is not a CSV.
        csv_names = sorted(
            name
            for name in os.listdir(file_path)
            if name.lower().endswith(".csv")
        )
        if not csv_names:
            raise FileNotFoundError(
                "No CSV file found in '" + file_path + "' after download"
            )

        df = pd.read_csv(os.path.join(file_path, csv_names[-1]))

        if not set(columns).issubset(df.columns):
            raise ValueError("Incorrect column names, please check again")

        return df[columns]

    except (TypeError, ValueError) as req:
        # Surface the message to interactive users, then let the caller
        # handle the original exception.
        print(req)
        raise