ads.dataflow package#

Submodules#

ads.dataflow.dataflow module#

class ads.dataflow.dataflow.DataFlow(compartment_id=None, dataflow_base_folder='/home/datascience/dataflow', os_auth=None, df_auth=None)[source]#

Bases: object

create_app(app_config: dict, overwrite_script=False, overwrite_archive=False) object[source]#

Create a new dataflow application with the supplied app config. app_config contains parameters needed to create a new application, according to oci.data_flow.models.CreateApplicationDetails.

Parameters:
  • app_config (dict) – the config file that contains all necessary parameters used to create a dataflow app

  • overwrite_script (bool) – whether to overwrite the existing pyscript script on Object Storage

  • overwrite_archive (bool) – whether to overwrite the existing archive file on Object Storage

Returns:

df_app – New dataflow application.

Return type:

oci.dataflow.models.Application

get_app(app_id: str)[source]#

Get the dataflow application based on app_id.

Parameters:

app_id (str, required) – The OCID of the dataflow app to get.

Returns:

app – The oci.dataflow.models.Application with the matching ID.

Return type:

oci.dataflow.models.Application

list_apps(include_deleted: bool = False, compartment_id: str | None = None, datetime_format: str = '%Y-%m-%d %H:%M:%S', **kwargs) object[source]#

List all apps in a given compartment, or in the current notebook session’s compartment.

Parameters:
  • include_deleted (bool, optional, default=False) – Whether to include deleted apps in the returned list.

  • compartment_id (str, optional, default: NB_SESSION_COMPARTMENT_OCID) – The compartment specified to list apps.

  • datetime_format (str, optional, default: '%Y-%m-%d %H:%M:%S') – Change format for date time fields.

Returns:

dsl – List of Dataflow applications.

Return type:

List

load_app(app_id: str, target_folder: str | None = None) object[source]#

Load an existing dataflow application based on application id. The existing dataflow application can be created either from dataflow service or the dataflow integration of ADS.

Parameters:
  • app_id (str, required) – The OCID of the dataflow app to load.

  • target_folder (str, optional,) – the folder to store the local artifacts of this application. If not specified, the target_folder will use the dataflow_base_folder by default.

Returns:

dfa – A dataflow application of type ads.dataflow.dataflow.DataFlowApp

Return type:

ads.dataflow.dataflow.DataFlowApp

prepare_app(display_name: str, script_bucket: str, pyspark_file_path: str, spark_version: str = '2.4.4', compartment_id: str | None = None, archive_path: str | None = None, archive_bucket: str | None = None, logs_bucket: str = 'dataflow-logs', driver_shape: str = 'VM.Standard2.4', executor_shape: str = 'VM.Standard2.4', num_executors: int = 1, arguments: list = [], script_parameters: dict = []) dict[source]#

Check if the parameters provided by users to create an application are valid and then prepare app_configuration for creating an app or saving for future reuse.

Parameters:
  • display_name (str, required) – A user-friendly name. This name is not necessarily unique.

  • script_bucket (str, required) – bucket in object storage to upload the pyspark file

  • pyspark_file_path (str, required) – path to the pyspark file

  • spark_version (str) – Allowed values are “2.4.4”, “3.0.2”.

  • compartment_id (str) – OCID of the compartment to create a dataflow app. If not provided, compartment_id will use the same as the notebook session.

  • archive_path (str, optional) – path to the archive file

  • archive_bucket (str, optional) – bucket in object storage to upload the archive file

  • logs_bucket (str, default is 'dataflow-logs') – bucket in object storage to put run logs

  • driver_shape (str) – The value to assign to the driver_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • executor_shape (str) – The value to assign to the executor_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • num_executors (int) – The number of executor VMs requested.

  • arguments (list of str) – The values passed into the command line string to run the application

  • script_parameters (dict) – The value of the parameters passed to the running application as command line arguments for the pyspark script.

Returns:

app_configuration

Return type:

dictionary containing all the validated params for CreateApplicationDetails.

template(job_type: str = 'standard_pyspark', script_str: str = '', file_dir: str | None = None, file_name: str | None = None) str[source]#

Populate a prewritten pyspark or sparksql python script with user’s choice to write additional lines and save in local directory.

Parameters:
  • job_type (str, default is 'standard_pyspark') – Currently supports two types, ‘standard_pyspark’ or ‘sparksql’

  • script_str (str, optional, default is '') – code provided by user to write in the python script

  • file_dir (str, optional) – Directory to save the python script in local directory

  • file_name (str, optional) – name of the python script to save to the local directory

Returns:

script_path – Path to the template generated python file in local directory

Return type:

str

class ads.dataflow.dataflow.DataFlowApp(app_config, app_response, app_dir, oci_link, **kwargs)[source]#

Bases: DataFlow

property config: dict#

Retrieve the app_config file used to create the data flow app

Returns:

app_config – dictionary containing all the validated params for this DataFlowApp

Return type:

Dict

get_run(run_id: str)[source]#

Get the Run based on run_id

Parameters:

run_id (str, required) – The OCID of the dataflow run to get.

Returns:

df_run – The oci.dataflow.models.Run with the matching ID.

Return type:

oci.dataflow.models.Run

list_runs(include_failed: bool = False, datetime_format: str = '%Y-%m-%d %H:%M:%S', **kwargs) object[source]#

List all runs of a dataflow app.

Parameters:
  • include_failed (bool, optional, default=False) – Whether to include failed runs in the returned list

  • datetime_format (str, optional, default: '%Y-%m-%d %H:%M:%S') – Change format for date time fields

Returns:

df_runs – List of Data flow runs.

Return type:

List

property oci_link: str#

Retrieve the oci link of the data flow app.

Returns:

oci_link – a link to the app page in an oci console.

Return type:

str

prepare_run(run_display_name: str, compartment_id: str | None = None, logs_bucket: str = '', driver_shape: str = 'VM.Standard2.4', executor_shape: str = 'VM.Standard2.4', num_executors: int = 1, **kwargs) dict[source]#

Check if the parameters provided by users to create a run are valid and then prepare run_config for creating run details.

Parameters:
  • run_display_name (str) – A user-friendly name. This name is not necessarily unique.

  • compartment_id (str) – OCID of the compartment to create a dataflow run. If not provided, compartment_id will use the same as the dataflow app.

  • logs_bucket (str) – bucket in object storage to put run logs, if not provided, will use the same logs_bucket as defined in app_config

  • driver_shape (str) – The value to assign to the driver_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • executor_shape (str) – The value to assign to the executor_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • num_executors (int) – The number of executor VMs requested.

Returns:

run_config – Dictionary containing all the validated params for CreateRunDetails.

Return type:

Dict

run(run_config: dict, save_log_to_local: bool = False, copy_script_to_object_storage: bool = True, copy_archive_to_object_storage: bool = True, pyspark_file_path: str | None = None, archive_path: str | None = None, wait: bool = True) object[source]#

Create a new dataflow run with the supplied run config. run_config contains parameters needed to create a new run, according to oci.data_flow.models.CreateRunDetails.

Parameters:
  • run_config (dict, required) – The config file that contains all necessary parameters used to create a dataflow run

  • save_log_to_local (bool, optional) – A boolean value that defaults to false. If set to true, it saves the log files to local dir

  • copy_script_to_object_storage (bool, optional) – A boolean value that defaults to true. Local script will be copied to object storage

  • copy_archive_to_object_storage (bool, optional) – A boolean value that defaults to true. Local archive file will be copied to object storage

  • pyspark_file_path (str, optional) – The pyspark file path used for creating the dataflow app. if pyspark_file_path isn’t specified then reuse the path that the app was created with.

  • archive_path (str, optional) – The archive file path used for creating the dataflow app. if archive_path isn’t specified then reuse the path that the app was created with.

  • wait (bool, optional) – A boolean value that defaults to true. When True, the return will be ads.dataflow.dataflow.DataFlowRun in terminal state. When False, the return will be a ads.dataflow.dataflow.RunObserver.

Returns:

df_run – Either a new Data Flow run or a run observer.

Return type:

Variable

class ads.dataflow.dataflow.DataFlowLog(text, oci_path, log_local_dir)[source]#

Bases: object

head(n: int = 10)[source]#

Show the first n lines of the log as the output of the notebook cell

Parameters:

n (int, default is 10) – the number of lines from head of the log file

Return type:

None

property local_dir#

Get the local directory where the log file is saved.

Returns:

local_dir – Path to the local directory where the log file is saved.

Return type:

str

property local_path#

Get the path of the log file in local directory

Returns:

local_path – Path of the log file in local directory

Return type:

str

property oci_path#

Get the path of the log file in object storage

Returns:

oci_path – Path of the log file in object storage

Return type:

str

save(log_dir=None)[source]#

Save the log file to a local directory.

Parameters:
  • log_dir (str, optional) – The path to the local directory in which to save the log file. If not set, the log will be saved to the _local_dir by default.

Return type:

None

show_all()[source]#

Show all content of the log as the output of the notebook cell

Return type:

None

tail(n: int = 10)[source]#

Show the last n lines of the log as the output of the notebook cell

Parameters:

n (int, default is 10) – the number of lines from tail of the log file

Return type:

None

class ads.dataflow.dataflow.DataFlowRun(run_config, run_response, save_log_to_local, local_dir, **kwargs)[source]#

Bases: DataFlow

LOG_OUTPUTS = ['stdout', 'stderr']#
property config: dict#

Retrieve the run_config file used to create the Data Flow run

Returns:

run_config – dictionary containing all the validated params for this DataFlowRun

Return type:

Dict

fetch_log(log_type: str) object[source]#

Fetch the log information of a run

Parameters:

log_type (str) – The type of log to fetch; one of 'stdout' or 'stderr'.

Returns:

dfl – a Data Flow log object

Return type:

DataFlowLog

property local_dir: str#

Retrieve the local directory of the data flow run

Returns:

local_dir – the local path to the Data Flow run

Return type:

str

property log_stderr: object#

Retrieve the stderr of the data flow run

Returns:

log_error – a clickable link that opens the stderr log in another tab in a JupyterLab notebook environment

Return type:

ads.dataflow.dataflow.DataFlowLog

property log_stdout: object#

Retrieve the stdout of the data flow run

Returns:

log_out – a clickable link that opens the stdout log in another tab in a JupyterLab notebook environment

Return type:

ads.dataflow.dataflow.DataFlowLog

property oci_link: str#

Retrieve the oci link of the data flow run.

Returns:

oci_link – link to the run page in an oci console

Return type:

str

property status: str#

Retrieve the status of the data flow run

Returns:

status – String that describes the status of the run

Return type:

str

update_config(param_dict) None[source]#

Modify the run_config file used to create the data flow run

Parameters:

param_dict (Dict) – Dictionary containing the key value pairs of the run_config parameters and the updated values.

Return type:

None

class ads.dataflow.dataflow.RunObserver(app, run_config, save_log_to_local)[source]#

Bases: object

property config: dict#

Retrieve the run_config file used to create the data flow run

Returns:

run_config – Dictionary containing all the validated parameters for this Data Flow run

Return type:

Dict

property local_dir: str#

Retrieve the local directory of the data flow run

Returns:

local_dir – the local path to the Data Flow run

Return type:

str

property oci_link: str#

Retrieve the oci link of the data flow run.

Returns:

oci_link – link to the run page in an oci console

Return type:

str

property status: str#

Returns the lifecycle state of the Data Flow run

update_config(param_dict) None[source]#

Modify the run_config file used to create the data flow run

Parameters:

param_dict (Dict) – dictionary containing the key value pairs of the run_config parameters and the updated values.

Return type:

None

wait()[source]#

Wait and monitor the run creation process.

Parameters:

None

Returns:

df_run – The oci.dataflow.models.Run after monitoring is done.

Return type:

oci.dataflow.models.Run

class ads.dataflow.dataflow.SPARK_VERSION[source]#

Bases: str

v2_4_4 = '2.4.4'#
v3_0_2 = '3.0.2'#

ads.dataflow.dataflowsummary module#

class ads.dataflow.dataflowsummary.SummaryList(entity_list, datetime_format='%Y-%m-%d %H:%M:%S')[source]#

Bases: list

abstract filter(selection, instance=None)[source]#

Abstract filter method for dataflow summary.

abstract sort_by(columns, reverse=False)[source]#

Abstract sort method for dataflow summary.

to_dataframe(datetime_format=None)[source]#

Abstract to_dataframe method for dataflow summary.

Module contents#