Managing machine learning datasets
Learn how to model machine-learning datasets using simple, reusable abstractions.
Introduction to Custom Dataset Classes
Implementing Dataset Classes for Different Data Sources
from abc import ABC, abstractmethod
import pandas as pd
from google.cloud import bigquery
from typing import Optional
class Dataset(ABC):
    """Abstract interface for a readable data source.

    Concrete subclasses wrap a specific backend (e.g. a CSV file or a
    BigQuery table) and materialize its contents as a pandas DataFrame.
    """

    @abstractmethod
    def read_data(self) -> pd.DataFrame:
        """Load the underlying data and return it as a DataFrame."""
        ...
class CSVDataset(Dataset):
    """Dataset backed by a CSV file on disk.

    A pre-loaded DataFrame may be supplied up front, in which case the
    file is never touched; otherwise the CSV at ``data_path`` is parsed
    lazily on first access and cached on the instance.
    """

    def __init__(self, data_path: str, df: Optional[pd.DataFrame] = None):
        self.df = df
        self.data_path = data_path

    def read_data(self) -> pd.DataFrame:
        """Return the cached DataFrame, reading it from disk if needed."""
        if self.df is not None:
            return self.df
        self.df = pd.read_csv(self.data_path)
        return self.df
class BigQueryDataset(Dataset):
def __init__(
self,
table_id: str,
df: Optional[pd.DataFrame] = None,
project: Optional[str] = None,
):
self.table_id = table_id
self.project = project
self.df = df
self.client = bigquery.Client(project=self.project)
def read_data(self) -> pd.DataFrame:
query = f"SELECT * FROM `{self.table_id}`"
self.df = self.client.query(query).to_dataframe()
return self.df
def write_data(self) -> None:
job_config = bigquery.LoadJobConfig(write_disposition="WRITE_TRUNCATE")
job = self.client.load_table_from_dataframe(self.df, self.table_id, job_config=job_config)
job.result()Creating Custom Materializers
Managing Complexity in Pipelines with Multiple Data Sources
Best Practices for Designing Flexible and Maintainable Pipelines
Next steps
Last updated
Was this helpful?