ads.dataflow package#
Submodules#
ads.dataflow.dataflow module#
- class ads.dataflow.dataflow.DataFlow(compartment_id=None, dataflow_base_folder='/home/datascience/dataflow', os_auth=None, df_auth=None)[source]#
Bases:
object
- create_app(app_config: dict, overwrite_script=False, overwrite_archive=False) object [source]#
Create a new dataflow application with the supplied app config. app_config contains parameters needed to create a new application, according to oci.data_flow.models.CreateApplicationDetails.
- Parameters:
- Returns:
df_app – New dataflow application.
- Return type:
oci.dataflow.models.Application
- get_app(app_id: str)[source]#
Get the dataflow application based on app_id.
- Parameters:
app_id (str, required) – The OCID of the dataflow app to get.
- Returns:
app – The oci.dataflow.models.Application with the matching ID.
- Return type:
oci.dataflow.models.Application
- list_apps(include_deleted: bool = False, compartment_id: str | None = None, datetime_format: str = '%Y-%m-%d %H:%M:%S', **kwargs) object [source]#
List all apps in a given compartment, or in the current notebook session’s compartment.
- Parameters:
include_deleted (bool, optional, default=False) – Whether to include deleted apps in the returned list.
compartment_id (str, optional, default: NB_SESSION_COMPARTMENT_OCID) – The compartment specified to list apps.
datetime_format (str, optional, default: '%Y-%m-%d %H:%M:%S') – Change format for date time fields.
- Returns:
dsl – List of Dataflow applications.
- Return type:
List
- load_app(app_id: str, target_folder: str | None = None) object [source]#
Load an existing dataflow application based on application id. The existing dataflow application can be created either from dataflow service or the dataflow integration of ADS.
- Parameters:
- Returns:
dfa – A dataflow application of type ads.dataflow.dataflow.DataFlowApp
- Return type:
- prepare_app(display_name: str, script_bucket: str, pyspark_file_path: str, spark_version: str = '2.4.4', compartment_id: str | None = None, archive_path: str | None = None, archive_bucket: str | None = None, logs_bucket: str = 'dataflow-logs', driver_shape: str = 'VM.Standard2.4', executor_shape: str = 'VM.Standard2.4', num_executors: int = 1, arguments: list = [], script_parameters: dict = []) dict [source]#
Check if the parameters provided by users to create an application are valid and then prepare app_configuration for creating an app or saving for future reuse.
- Parameters:
display_name (str, required) – A user-friendly name. This name is not necessarily unique.
script_bucket (str, required) – bucket in object storage to upload the pyspark file
pyspark_file_path (str, required) – path to the pyspark file
spark_version (str) – Allowed values are “2.4.4”, “3.0.2”.
compartment_id (str) – OCID of the compartment to create a dataflow app. If not provided, compartment_id will use the same as the notebook session.
archive_path (str, optional) – path to the archive file
archive_bucket (str, optional) – bucket in object storage to upload the archive file
logs_bucket (str, default is 'dataflow-logs') – bucket in object storage to put run logs
driver_shape (str) – The value to assign to the driver_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.
executor_shape (str) – The value to assign to the executor_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.
num_executors (int) – The number of executor VMs requested.
arguments (list of str) – The values passed into the command line string to run the application
script_parameters (dict) – The value of the parameters passed to the running application as command line arguments for the pyspark script.
- Returns:
app_configuration
- Return type:
dictionary containing all the validated params for CreateApplicationDetails.
- template(job_type: str = 'standard_pyspark', script_str: str = '', file_dir: str | None = None, file_name: str | None = None) str [source]#
Populate a prewritten pyspark or sparksql python script with user’s choice to write additional lines and save in local directory.
- Parameters:
job_type (str, default is 'standard_pyspark') – Currently supports two types, ‘standard_pyspark’ or ‘sparksql’
script_str (str, optional, default is '') – code provided by user to write in the python script
file_dir (str, optional) – Directory to save the python script in local directory
file_name (str, optional) – name of the python script to save to the local directory
- Returns:
script_path – Path to the template generated python file in local directory
- Return type:
- class ads.dataflow.dataflow.DataFlowApp(app_config, app_response, app_dir, oci_link, **kwargs)[source]#
Bases:
DataFlow
- property config: dict#
Retrieve the app_config file used to create the data flow app
- Returns:
app_config – dictionary containing all the validated params for this DataFlowApp
- Return type:
Dict
- get_run(run_id: str)[source]#
Get the Run based on run_id
- Parameters:
run_id (str, required) – The OCID of the dataflow run to get.
- Returns:
df_run – The oci.dataflow.models.Run with the matching ID.
- Return type:
oci.dataflow.models.Run
- list_runs(include_failed: bool = False, datetime_format: str = '%Y-%m-%d %H:%M:%S', **kwargs) object [source]#
List all runs of a dataflow app.
- property oci_link: object#
Retrieve the oci link of the data flow app
- Returns:
oci_link – a link to the app page in an oci console.
- Return type:
- prepare_run(run_display_name: str, compartment_id: str | None = None, logs_bucket: str = '', driver_shape: str = 'VM.Standard2.4', executor_shape: str = 'VM.Standard2.4', num_executors: int = 1, **kwargs) dict [source]#
Check if the parameters provided by users to create a run are valid and then prepare run_config for creating run details.
- Parameters:
run_display_name (str) – A user-friendly name. This name is not necessarily unique.
compartment_id (str) – OCID of the compartment to create a dataflow run. If not provided, compartment_id will use the same as the dataflow app.
logs_bucket (str) – bucket in object storage to put run logs, if not provided, will use the same logs_bucket as defined in app_config
driver_shape (str) – The value to assign to the driver_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.
executor_shape (str) – The value to assign to the executor_shape property of this CreateApplicationDetails. Allowed values for this property are: “VM.Standard2.1”, “VM.Standard2.2”, “VM.Standard2.4”, “VM.Standard2.8”, “VM.Standard2.16”, “VM.Standard2.24”.
num_executors (int) – The number of executor VMs requested.
- Returns:
run_config – Dictionary containing all the validated params for CreateRunDetails.
- Return type:
Dict
- run(run_config: dict, save_log_to_local: bool = False, copy_script_to_object_storage: bool = True, copy_archive_to_object_storage: bool = True, pyspark_file_path: str | None = None, archive_path: str | None = None, wait: bool = True) object [source]#
Create a new dataflow run with the supplied run config. run_config contains parameters needed to create a new run, according to oci.data_flow.models.CreateRunDetails.
- Parameters:
run_config (dict, required) – The config file that contains all necessary parameters used to create a dataflow run
save_log_to_local (bool, optional) – A boolean value that defaults to false. If set to true, it saves the log files to local dir
copy_script_to_object_storage (bool, optional) – A boolean value that defaults to true. Local script will be copied to object storage
copy_archive_to_object_storage (bool, optional) – A boolean value that defaults to true. Local archive file will be copied to object storage
pyspark_file_path (str, optional) – The pyspark file path used for creating the dataflow app. If pyspark_file_path isn’t specified then reuse the path that the app was created with.
archive_path (str, optional) – The archive file path used for creating the dataflow app. If archive_path isn’t specified then reuse the path that the app was created with.
wait (bool, optional) – A boolean value that defaults to true. When True, the return will be ads.dataflow.dataflow.DataFlowRun in terminal state. When False, the return will be a ads.dataflow.dataflow.RunObserver.
- Returns:
df_run – Either a new Data Flow run or a run observer.
- Return type:
Variable
- class ads.dataflow.dataflow.DataFlowLog(text, oci_path, log_local_dir)[source]#
Bases:
object
- head(n: int = 10)[source]#
Show the first n lines of the log as the output of the notebook cell
- Parameters:
n (int, default is 10) – the number of lines from head of the log file
- Return type:
None
- property local_dir#
Get the local directory where the log file is saved.
- Returns:
local_dir – Path to the local directory where the log file is saved.
- Return type:
- property local_path#
Get the path of the log file in local directory
- Returns:
local_path – Path of the log file in local directory
- Return type:
- property oci_path#
Get the path of the log file in object storage
- Returns:
oci_path – Path of the log file in object storage
- Return type:
- save(log_dir=None)[source]#
Save the log file to a local directory.
- Parameters:
log_dir (str, optional) – The path to the local directory in which to save the log file. If not set, the log will be saved to the _local_dir by default.
- Return type:
None
- class ads.dataflow.dataflow.DataFlowRun(run_config, run_response, save_log_to_local, local_dir, **kwargs)[source]#
Bases:
DataFlow
- LOG_OUTPUTS = ['stdout', 'stderr']#
- property config: dict#
Retrieve the run_config file used to create the Data Flow run
- Returns:
run_config – dictionary containing all the validated params for this DataFlowRun
- Return type:
Dict
- fetch_log(log_type: str) object [source]#
Fetch the log information of a run
- Parameters:
log_type (str) – The type of log to fetch; must be one of ‘stdout’ or ‘stderr’.
- Returns:
dfl – a Data Flow log object
- Return type:
- property local_dir: str#
Retrieve the local directory of the data flow run
- Returns:
local_dir – the local path to the Data Flow run
- Return type:
- property log_stderr: object#
Retrieve the stderr of the data flow run
- Returns:
log_error – a clickable link that opens the stderr log in another tab in a JupyterLab notebook environment
- Return type:
- property log_stdout: object#
Retrieve the stdout of the data flow run
- Returns:
log_out – a clickable link that opens the stdout log in another tab in a JupyterLab notebook environment
- Return type:
- property oci_link: object#
Retrieve the oci link of the data flow run
- Returns:
oci_link – link to the run page in an oci console
- Return type:
- class ads.dataflow.dataflow.RunObserver(app, run_config, save_log_to_local)[source]#
Bases:
object
- property config: dict#
Retrieve the run_config file used to create the data flow run
- Returns:
run_config – Dictionary containing all the validated parameters for this Data Flow run
- Return type:
Dict
- property local_dir: str#
Retrieve the local directory of the data flow run
- Returns:
local_dir – the local path to the Data Flow run
- Return type:
- property oci_link: object#
Retrieve the oci link of the data flow run
- Returns:
oci_link – link to the run page in an oci console
- Return type: