ads.dataflow package#

Submodules#

ads.dataflow.dataflow module#

class ads.dataflow.dataflow.DataFlow(compartment_id=None, dataflow_base_folder='/home/datascience/dataflow', os_auth=None, df_auth=None)[source]#

Bases: object

create_app(app_config: dict, overwrite_script=False, overwrite_archive=False) object[source]#

Create a new dataflow application with the supplied app config. app_config contains parameters needed to create a new application, according to oci.data_flow.models.CreateApplicationDetails.

Parameters:
  • app_config (dict) – the config file that contains all necessary parameters used to create a dataflow app

  • overwrite_script (bool) – whether to overwrite the existing pyscript script on Object Storage

  • overwrite_archive (bool) – whether to overwrite the existing archive file on Object Storage

Returns:

df_app – New dataflow application.

Return type:

oci.dataflow.models.Application

get_app(app_id: str)[source]#

Get the dataflow application based on app_id.

Parameters:

app_id (str, required) – The OCID of the dataflow app to get.

Returns:

app – The oci.dataflow.models.Application with the matching ID.

Return type:

oci.dataflow.models.Application

list_apps(include_deleted: bool = False, compartment_id: str | None = None, datetime_format: str = '%Y-%m-%d %H:%M:%S', **kwargs) object[source]#

List all apps in a given compartment, or in the current notebook session’s compartment.

Parameters:
  • include_deleted (bool, optional, default=False) – Whether to include deleted apps in the returned list.

  • compartment_id (str, optional, default: NB_SESSION_COMPARTMENT_OCID) – The compartment specified to list apps.

  • datetime_format (str, optional, default: '%Y-%m-%d %H:%M:%S') – Change format for date time fields.

Returns:

dsl – List of Dataflow applications.

Return type:

List

load_app(app_id: str, target_folder: str | None = None) object[source]#

Load an existing dataflow application based on application id. The existing dataflow application can be created either from dataflow service or the dataflow integration of ADS.

Parameters:
  • app_id (str, required) – The OCID of the dataflow app to load.

  • target_folder (str, optional,) – the folder to store the local artifacts of this application. If not specified, the target_folder will use the dataflow_base_folder by default.

Returns:

dfa – A dataflow application of type ads.dataflow.dataflow.DataFlowApp

Return type:

ads.dataflow.dataflow.DataFlowApp

prepare_app(display_name: str, script_bucket: str, pyspark_file_path: str, spark_version: str = '2.4.4', compartment_id: str | None = None, archive_path: str | None = None, archive_bucket: str | None = None, logs_bucket: str = 'dataflow-logs', driver_shape: str = 'VM.Standard2.4', executor_shape: str = 'VM.Standard2.4', num_executors: int = 1, arguments: list = [], script_parameters: dict = []) dict[source]#

Check if the parameters provided by users to create an application are valid and then prepare app_configuration for creating an app or saving for future reuse.

Parameters:
  • display_name (str, required) – A user-friendly name. This name is not necessarily unique.

  • script_bucket (str, required) – bucket in object storage to upload the pyspark file

  • pyspark_file_path (str, required) – path to the pyspark file

  • spark_version (str) – Allowed values are “2.4.4”, “3.0.2”.

  • compartment_id (str) – OCID of the compartment to create a dataflow app. If not provided, compartment_id will use the same as the notebook session.

  • archive_path (str, optional) – path to the archive file

  • archive_bucket (str, optional) – bucket in object storage to upload the archive file

  • logs_bucket (str, default is 'dataflow-logs') – bucket in object storage to put run logs

  • driver_shape (str) – The value to assign to the driver_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • executor_shape (str) – The value to assign to the executor_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • num_executors (int) – The number of executor VMs requested.

  • arguments (list of str) – The values passed into the command line string to run the application

  • script_parameters (dict) – The value of the parameters passed to the running application as command line arguments for the pyspark script.

Returns:

app_configuration

Return type:

dictionary containing all the validated params for CreateApplicationDetails.

template(job_type: str = 'standard_pyspark', script_str: str = '', file_dir: str | None = None, file_name: str | None = None) str[source]#

Populate a prewritten pyspark or sparksql python script with user’s choice to write additional lines and save in local directory.

Parameters:
  • job_type (str, default is 'standard_pyspark') – Currently supports two types, ‘standard_pyspark’ or ‘sparksql’

  • script_str (str, optional, default is '') – code provided by user to write in the python script

  • file_dir (str, optional) – Directory to save the python script in local directory

  • file_name (str, optional) – name of the python script to save to the local directory

Returns:

script_path – Path to the template generated python file in local directory

Return type:

str

class ads.dataflow.dataflow.DataFlowApp(app_config, app_response, app_dir, oci_link, **kwargs)[source]#

Bases: DataFlow

property config: dict#

Retrieve the app_config file used to create the data flow app

Returns:

app_config – dictionary containing all the validated params for this DataFlowApp

Return type:

Dict

get_run(run_id: str)[source]#

Get the Run based on run_id

Parameters:

run_id (str, required) – The OCID of the dataflow run to get.

Returns:

df_run – The oci.dataflow.models.Run with the matching ID.

Return type:

oci.dataflow.models.Run

list_runs(include_failed: bool = False, datetime_format: str = '%Y-%m-%d %H:%M:%S', **kwargs) object[source]#

List all runs of a dataflow app.

Parameters:
  • include_failed (bool, optional, default=False) – Whether to include failed runs in the returned list

  • datetime_format (str, optional, default: '%Y-%m-%d %H:%M:%S') – Change format for date time fields

Returns:

df_runs – List of Data flow runs.

Return type:

List

property oci_link: str#

Retrieve the oci link of the data flow app.

Returns:

oci_link – a link to the app page in an oci console.

Return type:

str

prepare_run(run_display_name: str, compartment_id: str | None = None, logs_bucket: str = '', driver_shape: str = 'VM.Standard2.4', executor_shape: str = 'VM.Standard2.4', num_executors: int = 1, **kwargs) dict[source]#

Check if the parameters provided by users to create a run are valid and then prepare run_config for creating run details.

Parameters:
  • run_display_name (str) – A user-friendly name. This name is not necessarily unique.

  • compartment_id (str) – OCID of the compartment to create a dataflow run. If not provided, compartment_id will use the same as the dataflow app.

  • logs_bucket (str) – bucket in object storage to put run logs, if not provided, will use the same logs_bucket as defined in app_config

  • driver_shape (str) – The value to assign to the driver_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • executor_shape (str) – The value to assign to the executor_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.

  • num_executors (int) – The number of executor VMs requested.

Returns:

run_config – Dictionary containing all the validated params for CreateRunDetails.

Return type:

Dict

run(run_config: dict, save_log_to_local: bool = False, copy_script_to_object_storage: bool = True, copy_archive_to_object_storage: bool = True, pyspark_file_path: str | None = None, archive_path: str | None = None, wait: bool = True) object[source]#

Create a new dataflow run with the supplied run config. run_config contains parameters needed to create a new run, according to oci.data_flow.models.CreateRunDetails.

Parameters:
  • run_config (dict, required) – The config file that contains all necessary parameters used to create a dataflow run

  • save_log_to_local (bool, optional) – A boolean value that defaults to false. If set to true, it saves the log files to local dir

  • copy_script_to_object_storage (bool, optional) – A boolean value that defaults to true. Local script will be copied to object storage

  • copy_archive_to_object_storage (bool, optional) – A boolean value that defaults to true. Local archive file will be copied to object storage

  • pyspark_file_path (str, optional) – The pyspark file path used for creating the dataflow app. if pyspark_file_path isn’t specified then reuse the path that the app was created with.

  • archive_path (str, optional) – The archive file path used for creating the dataflow app. if archive_path isn’t specified then reuse the path that the app was created with.

  • wait (bool, optional) – A boolean value that defaults to true. When True, the return will be ads.dataflow.dataflow.DataFlowRun in terminal state. When False, the return will be a ads.dataflow.dataflow.RunObserver.

Returns:

df_run – Either a new Data Flow run or a run observer.

Return type:

Variable

class ads.dataflow.dataflow.DataFlowLog(text, oci_path, log_local_dir)[source]#

Bases: object

head(n: int = 10)[source]#

Show the first n lines of the log as the output of the notebook cell

Parameters:

n (int, default is 10) – the number of lines from head of the log file

Return type:

None

property local_dir#

Get the local directory where the log file is saved.

Returns:

local_dir – Path to the local directory where the log file is saved.

Return type:

str

property local_path#

Get the path of the log file in local directory

Returns:

local_path – Path of the log file in local directory

Return type:

str

property oci_path#

Get the path of the log file in object storage

Returns:

oci_path – Path of the log file in object storage

Return type:

str

save(log_dir=None)[source]#

Save the log file to a local directory.

Parameters:
  • log_dir (str, optional) – The path to the local directory in which to save the log file. If not set, the log will be saved to the _local_dir by default.

Return type:

None

show_all()[source]#

Show all content of the log as the output of the notebook cell

Return type:

None

tail(n: int = 10)[source]#

Show the last n lines of the log as the output of the notebook cell

Parameters:

n (int, default is 10) – the number of lines from tail of the log file

Return type:

None

class ads.dataflow.dataflow.DataFlowRun(run_config, run_response, save_log_to_local, local_dir, **kwargs)[source]#

Bases: DataFlow

LOG_OUTPUTS = ['stdout', 'stderr']#
property config: dict#

Retrieve the run_config file used to create the Data Flow run

Returns:

run_config – dictionary containing all the validated params for this DataFlowRun

Return type:

Dict

fetch_log(log_type: str) object[source]#

Fetch the log information of a run

Parameters:

log_type (str) – The type of log to fetch; one of 'stdout' or 'stderr'.

Returns:

dfl – a Data Flow log object

Return type:

DataFlowLog

property local_dir: str#

Retrieve the local directory of the data flow run

Returns:

local_dir – the local path to the Data Flow run

Return type:

str

property log_stderr: object#

Retrieve the stderr of the data flow run

Returns:

log_error – a clickable link that opens the stderr log in another tab in a JupyterLab notebook environment

Return type:

ads.dataflow.dataflow.DataFlowLog

property log_stdout: object#

Retrieve the stdout of the data flow run

Returns:

log_out – a clickable link that opens the stdout log in another tab in a JupyterLab notebook environment

Return type:

ads.dataflow.dataflow.DataFlowLog

property oci_link: str#

Retrieve the oci link of the data flow run.

Returns:

oci_link – link to the run page in an oci console

Return type:

str

property status: str#

Retrieve the status of the data flow run

Returns:

status – String that describes the status of the run

Return type:

str

update_config(param_dict) None[source]#

Modify the run_config file used to create the data flow run

Parameters:

param_dict (Dict) – Dictionary containing the key value pairs of the run_config parameters and the updated values.

Return type:

None

class ads.dataflow.dataflow.RunObserver(app, run_config, save_log_to_local)[source]#

Bases: object

property config: dict#

Retrieve the run_config file used to create the data flow run

Returns:

run_config – Dictionary containing all the validated parameters for this Data Flow run

Return type:

Dict

property local_dir: str#

Retrieve the local directory of the data flow run

Returns:

local_dir – the local path to the Data Flow run

Return type:

str

property oci_link: str#

Retrieve the oci link of the data flow run.

Returns:

oci_link – link to the run page in an oci console

Return type:

str

property status: str#

Returns the lifecycle state of the Data Flow run

update_config(param_dict) None[source]#

Modify the run_config file used to create the data flow run

Parameters:

param_dict (Dict) – dictionary containing the key value pairs of the run_config parameters and the updated values.

Return type:

None

wait()[source]#

Wait and monitor the run creation process.

Parameters:

None

Returns:

df_run – The oci.dataflow.models.Run after monitoring is done.

Return type:

oci.dataflow.models.Run

class ads.dataflow.dataflow.SPARK_VERSION[source]#

Bases: str

v2_4_4 = '2.4.4'#
v3_0_2 = '3.0.2'#

ads.dataflow.dataflowsummary module#

class ads.dataflow.dataflowsummary.SummaryList(entity_list, datetime_format='%Y-%m-%d %H:%M:%S')[source]#

Bases: list

abstract filter(selection, instance=None)[source]#

Abstract filter method for dataflow summary.

abstract sort_by(columns, reverse=False)[source]#

Abstract sort method for dataflow summary.

to_dataframe(datetime_format=None)[source]#

Abstract to_dataframe method for dataflow summary.

Module contents#