Add split UDF func to ease certain string handling (#2039)
Summary: Add split UDF func to ease certain string handling

This is functionality I'm planning to use for my upcoming KubeCon
demo/talk. For this use case, I want to access the individual fields of
the XFCC (X-Forwarded-Client-Cert) header, just like the example in the
UDF docstring.
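
For context, a minimal PxL sketch of that workflow (adapted from the UDF
docstring below; column names are illustrative):

import px

df = px.DataFrame('http_events', start_time='-5m')
# e.g. 'By=http://frontend.px.dev;URI=http://testclient.px.dev'
df.xfcc_hdr = px.pluck(df.req_headers, 'X-Forwarded-Client-Cert')
df.xfcc_parts = px.split(df.xfcc_hdr, ';')  # JSON array of the parts
df.by = px.pluck_array(df.xfcc_parts, 0)
df.uri = px.pluck_array(df.xfcc_parts, 1)
px.display(df[['by', 'uri']])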

Relevant Issues: N/A

Type of change: /kind feature

Test Plan: New tests pass

Changelog Message: Added `px.split` function to support parsing strings
that contain delimiters

---------

Signed-off-by: Dom Del Nano <[email protected]>
ddelnano authored Oct 11, 2024
1 parent 3c41d55 commit 30c72a1
Showing 3 changed files with 47 additions and 0 deletions.
1 change: 1 addition & 0 deletions src/carnot/funcs/builtins/json_ops.cc
@@ -31,6 +31,7 @@ void RegisterJSONOpsOrDie(udf::Registry* registry) {
registry->RegisterOrDie<PluckAsInt64UDF>("pluck_int64");
registry->RegisterOrDie<PluckAsFloat64UDF>("pluck_float64");
registry->RegisterOrDie<PluckArrayUDF>("pluck_array");
registry->RegisterOrDie<SplitUDF>("split");

// Up to 8 script args are supported for the _script_reference UDF, due to the lack of support for
// variadic UDF arguments in the UDF registry today. We should clean this up if/when variadic UDF
36 changes: 36 additions & 0 deletions src/carnot/funcs/builtins/json_ops.h
@@ -228,6 +228,42 @@ class PluckArrayUDF : public udf::ScalarUDF {
}
};

class SplitUDF : public udf::ScalarUDF {
public:
StringValue Exec(FunctionContext*, StringValue in, StringValue delimiter) {
// Emit the split parts as a JSON array so the result can be consumed by
// px.pluck_array.
rapidjson::StringBuffer sb;
rapidjson::Writer<rapidjson::StringBuffer> writer(sb);
writer.StartArray();

for (absl::string_view part : absl::StrSplit(in.data(), delimiter.data())) {
writer.String(part.data(), part.size());
}

writer.EndArray();
return sb.GetString();
}

static udf::ScalarUDFDocBuilder Doc() {
return udf::ScalarUDFDocBuilder(
"Splits a string by a delimiter and a returns JSON encoded array of strings.")
.Details(
"This function splits a string by a delimiter and returns a JSON-encoded array of "
"strings. It is useful for parsing delimited strings: pass the result to "
"px.pluck_array to access the individual values.")
.Example(R"doc(
| df = px.DataFrame('http_events', start_time='-5m')
| # xfcc_hdr contains e.g. By=http://frontend.px.dev;URI=http://testclient.px.dev
| df.xfcc_hdr = px.pluck(df.req_headers, 'X-Forwarded-Client-Cert')
| df.xfcc_parts = px.split(df.xfcc_hdr, ';')
| df.by = px.pluck_array(df.xfcc_parts, 0) # Returns "By=http://frontend.px.dev"
| df.uri = px.pluck_array(df.xfcc_parts, 1) # Returns "URI=http://testclient.px.dev"
)doc")
.Arg("input_str", "The string to split.")
.Arg("delimiter", "The string value to split the input string.")
.Returns("A JSON encoded array of the split strings.");
}
};

/**
DocString intentionally omitted, this is a non-public function.
This function creates a custom deep link by creating a "script reference" from a label,
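For readers unfamiliar with rapidjson, SplitUDF's behavior can be approximated
in a few lines of plain Python (an illustrative sketch, not part of this
commit; split_to_json is a hypothetical helper):

import json

def split_to_json(s: str, delim: str) -> str:
    # str.split preserves empty parts, matching absl::StrSplit's defaults;
    # separators=(",", ":") mimics rapidjson's compact output (no spaces).
    return json.dumps(s.split(delim), separators=(",", ":"))

assert split_to_json("foo,bar,baz", ",") == '["foo","bar","baz"]'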
10 changes: 10 additions & 0 deletions src/carnot/funcs/builtins/json_ops_test.cc
@@ -111,6 +111,16 @@ TEST(JSONOps, PluckArrayUDF_index_out_of_bound) {
udf_tester.ForInput(kTestJSONArray, 3).Expect("");
}

TEST(JSONOps, SplitUDF_with_present_delimiter) {
auto udf_tester = udf::UDFTester<SplitUDF>();
udf_tester.ForInput("foo,bar,baz", ",").Expect(R"(["foo","bar","baz"])");
}

TEST(JSONOps, SplitUDF_with_missing_delimiter) {
auto udf_tester = udf::UDFTester<SplitUDF>();
udf_tester.ForInput("foo,bar,baz", ";").Expect(R"(["foo,bar,baz"])");
}

TEST(JSONOps, ScriptReferenceUDF_no_args) {
auto udf_tester = udf::UDFTester<ScriptReferenceUDF<>>();
auto res = udf_tester.ForInput("text", "px/script").Result();
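The two new tests cover the common cases; assuming absl::StrSplit's documented
defaults, two edge cases the UDF would inherit behave as below (a Python sketch
of hypothetical expectations, not tests in this commit):

import json

# Splitting the empty string yields a single empty part.
assert json.dumps("".split(","), separators=(",", ":")) == '[""]'
# Consecutive delimiters preserve the empty parts between them.
assert json.dumps("a,,b".split(","), separators=(",", ":")) == '["a","","b"]'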
