#Script version information
{
"version": "1.0.0",
"last_edited": "2024-10-18 10.50.41",
"edited_by": "MKumar",
"script_type": "module"
}
##############################################################################################################################
#Imports:
import pandas as pd
from openpyxl import Workbook, load_workbook
##############################################################################################################################
# Functions:
def read_excel_to_dataframe(file_path, sheet_name, read_entire_worksheet=True, cell_range=None):
    '''
    Read a specific workbook and worksheet into a pandas DataFrame.
    Parameters:
    - file_path: Path to the Excel (.xlsx) file.
    - sheet_name: Name of the worksheet to read.
    - read_entire_worksheet: If True, read the whole sheet and promote its
      first row to the column header. If False, cell_range must be provided.
    - cell_range: Excel cell range (e.g., 'A1:C10'); required only when
      read_entire_worksheet is False. Range reads keep pandas' default
      integer column labels (no header row is consumed).
    Return:
    - df: pandas DataFrame with the requested data; columns that are
      entirely empty are dropped.
    Raises:
    - ValueError: if read_entire_worksheet is False and cell_range is None.
    # Example usage:
    # df = read_excel_to_dataframe('file_path.xlsx', 'Sheet1', True)
    # If read_entire_worksheet is False, a range must be provided:
    # df = read_excel_to_dataframe('file_path.xlsx', 'Sheet1', False, 'A1:C10')
    '''
    # data_only=True returns cached formula results instead of formula strings
    workbook = load_workbook(file_path, data_only=True)
    worksheet = workbook[sheet_name]
    if read_entire_worksheet:
        df = pd.DataFrame(worksheet.values)
        # Guard the header promotion: df.iloc[0] raises IndexError on an
        # empty worksheet, so only promote when there is at least one row.
        if not df.empty:
            df.columns = df.iloc[0]          # First row becomes the header
            df = df[1:].reset_index(drop=True)  # Drop it from the data, renumber rows
    else:
        if cell_range is None:
            raise ValueError("cell_range must be provided when read_entire_worksheet is False")
        # worksheet[range] yields tuples of Cell objects; extract their values
        cells = worksheet[cell_range]
        data = [[cell.value for cell in row] for row in cells]
        df = pd.DataFrame(data)
    # Drop any completely empty columns (applies to both read modes)
    df = df.dropna(axis=1, how='all')
    return df
##############################################################################################################################
def write_dataframe_to_excel(file_path, sheet_name, start_cell, df):
    '''
    Write a pandas DataFrame's values to a specific worksheet in an
    existing Excel file, starting at a given anchor cell.
    Note: only the cell values (df.values) are written — the DataFrame's
    column headers are NOT written to the sheet.
    Parameters:
    - file_path: Path to an existing Excel file (load_workbook requires it
      to exist already).
    - sheet_name: Name of the worksheet to write into.
    - start_cell: The top-left cell (e.g., 'A1') where writing begins.
    - df: pandas DataFrame containing the data to write.
    Return:
    - None. The workbook is saved back to file_path in place.
    # Example usage:
    # df = pd.DataFrame({'Col1': [1, 2], 'Col2': [3, 4]})
    # write_dataframe_to_excel('file_path.xlsx', 'Sheet1', 'A1', df)
    '''
    workbook = load_workbook(file_path)
    worksheet = workbook[sheet_name]
    # Resolve the anchor cell once instead of looking it up twice
    anchor = worksheet[start_cell]
    start_row = anchor.row
    start_col = anchor.column
    # Write each value offset from the anchor position
    for r_idx, row in enumerate(df.values, start=start_row):
        for c_idx, value in enumerate(row, start=start_col):
            worksheet.cell(row=r_idx, column=c_idx, value=value)
    # Persist the changes
    workbook.save(file_path)
##############################################################################################################################
Step 2: Use this module in one of your custom scripts — for example, a script named get_worksheet_as_collection.py:
#Script version information
{
"version": "1.0.0",
"last_edited": "2024-10-18 10.50.41",
"edited_by": "MKumar",
"script_type": "module"
}
##############################################################################################################################
import Robotics_OpenPYXL_Basic as excel
import pandas as pd
import json
import sys
import os
#Read JSON file and assign values to variables
# Read the JSON payload passed as the first command-line argument and
# unpack the expected keys into local variables.
inputs_str = r'' + sys.argv[1]
json_data = json.loads(inputs_str)
file_path = json_data['filepath_spreadsheet']
worksheet = json_data['worksheet']
flag = json_data['read_entire_worksheet']
# Renamed from 'range' to avoid shadowing the Python builtin
cell_range = json_data['cell_range']
output_folder = json_data['output_folder']
##############################################################################################################################
# Call the read function from the custom library
try:
    df = excel.read_excel_to_dataframe(file_path, worksheet, flag, cell_range)
    # Drop columns that are entirely NaN and renumber the rows; the
    # intermediate pd.DataFrame(df.values, ...) copy was redundant.
    df = df.dropna(axis=1, how='all')
    df.reset_index(drop=True, inplace=True)
    # Ensure the output folder exists, then save the DataFrame as CSV
    os.makedirs(output_folder, exist_ok=True)
    output_file_path = os.path.join(output_folder, 'output.csv')
    df.to_csv(output_file_path, index=False, sep=',', encoding='utf-8')
    # Emit the result in the 'output=<json>' shape the caller parses
    outputs = {
        "status": "success",
        "output_file": output_file_path
    }
    output_str = 'output=' + json.dumps(outputs)
    print(output_str)
except Exception as e:
    # Surface the failure on stdout for the calling automation to read
    print(f"An error occurred: {e}")
Step 3: Use the Blue Prism "Utility - Script Execution" action to run this script. Once the script succeeds and the output CSV file is created, you can use the "Utility - File Management" action "Get CSV Text as Collection - Chunk" as mentioned by @tejaswi_; alternatively, use the "Read All Text from File" action with "UTF-8" encoding (the script writes the CSV as UTF-8), which should be used for smaller data sets.