graviti.portex.extractors#

Methods for converting a schema into column extractors.

Module Contents#

Functions#

get_extractors(schema)

Get the extractors and dtypes for columns.

Attributes#

graviti.portex.extractors.Extractors[source]#
graviti.portex.extractors.get_extractors(schema)[source]#

Get the extractors and dtypes for columns.

Parameters

schema (Dict[str, Any]) – The schema of a DataFrame.

Returns

A dict containing the extractors and dtypes for all columns.

Return type

Extractors

Examples

>>> import yaml
>>>
>>> from graviti.client import list_data_details
>>> from graviti.utility.lazy import LazyFactory, LazyList
>>> from graviti.portex import catalog_to_schema, get_extractors
>>>
>>> from tensorbay import GAS
>>> from tensorbay.dataset import Dataset
>>> ACCESSKEY = "ACCESSKEY-xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx"
>>> URL = "https://gas.graviti.com/"
>>> DATASET_NAME = "MNIST"
>>> TOTAL_COUNT = 1000
>>>
>>> gas = GAS(ACCESSKEY)
>>> dataset = Dataset(DATASET_NAME, gas)
>>> dataset_client = gas.get_dataset(DATASET_NAME)
>>>
>>> getter = lambda offset, limit: list_data_details(
...     url=URL,
...     access_key=ACCESSKEY,
...     dataset_id=dataset_client.dataset_id,
...     segment_name="train",
...     commit=dataset_client.status.commit_id,
...     offset=offset,
...     limit=limit,
... )
>>> factory = LazyFactory(TOTAL_COUNT, 128, getter)
>>> schema = yaml.load(
...    catalog_to_schema(dataset.catalog, dataset["train"][0], dataset.notes), yaml.Loader
... )
>>> extractors = get_extractors(schema)
>>> lazy_lists = {}
>>> for key, arguments in extractors.items():
...     lazy_lists[key] = factory.create_list(*arguments)
>>> lazy_lists
{'filename': LazyList [
   'train_image_00000.png',
   'train_image_00001.png',
   'train_image_00002.png',
   'train_image_00003.png',
   'train_image_00004.png',
   'train_image_00005.png',
   'train_image_00006.png',
   'train_image_00007.png',
   'train_image_00008.png',
   'train_image_00009.png',
   'train_image_00010.png',
   'train_image_00011.png',
   'train_image_00012.png',
   'train_image_00013.png',
   ... (985 items are folded),
   'train_image_00999.png'
 ],
 'image': LazyList [
   RemoteFileMixin("train_image_00000.png"),
   RemoteFileMixin("train_image_00001.png"),
   RemoteFileMixin("train_image_00002.png"),
   RemoteFileMixin("train_image_00003.png"),
   RemoteFileMixin("train_image_00004.png"),
   RemoteFileMixin("train_image_00005.png"),
   RemoteFileMixin("train_image_00006.png"),
   RemoteFileMixin("train_image_00007.png"),
   RemoteFileMixin("train_image_00008.png"),
   RemoteFileMixin("train_image_00009.png"),
   RemoteFileMixin("train_image_00010.png"),
   RemoteFileMixin("train_image_00011.png"),
   RemoteFileMixin("train_image_00012.png"),
   RemoteFileMixin("train_image_00013.png"),
   ... (985 items are folded),
   RemoteFileMixin("train_image_00999.png")
 ],
 'category': LazyList [
   '5',
   '0',
   '4',
   '1',
   '9',
   '2',
   '1',
   '3',
   '1',
   '4',
   '3',
   '5',
   '3',
   '6',
   ... (985 items are folded),
   '6'
 ]}