from datetime import datetime
import re

import numpy as np
import pytest

import pandas as pd
from pandas import (
    DataFrame,
    Index,
    MultiIndex,
    Series,
    _testing as tm,
)
from pandas.tests.strings import (
    _convert_na_value,
    is_object_or_nan_string_dtype,
)


@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split(any_string_dtype, method):
    """Split on a one-character separator and compare against per-row lists."""
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)
    splitter = getattr(ser.str, method)

    result = splitter("_")

    expected = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    expected = _convert_na_value(ser, expected)
    tm.assert_series_equal(result, expected)


@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_more_than_one_char(any_string_dtype, method):
    """Split on a two-character separator, with and without ``expand=False``."""
    # more than one char
    ser = Series(["a__b__c", "c__d__e", np.nan, "f__g__h"], dtype=any_string_dtype)
    splitter = getattr(ser.str, method)

    default_result = splitter("__")
    expected = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    expected = _convert_na_value(ser, expected)
    tm.assert_series_equal(default_result, expected)

    # Passing expand=False explicitly must match the default call above.
    unexpanded_result = splitter("__", expand=False)
    tm.assert_series_equal(unexpanded_result, expected)


def test_split_more_regex_split(any_string_dtype):
    """Split with a character-class pattern that matches ``,`` or ``_``."""
    # regex split
    ser = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)

    result = ser.str.split("[,_]")

    expected = Series([["a", "b", "c"], ["c", "d", "e"], np.nan, ["f", "g", "h"]])
    expected = _convert_na_value(ser, expected)
    tm.assert_series_equal(result, expected)


def test_split_regex(any_string_dtype):
    """Split with ``regex=True`` and an escaped-dot pattern."""
    # GH 43563
    # explicit regex = True split
    ser = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)

    result = ser.str.split(r"\.jpg", regex=True)

    expected = Series([["xxxjpgzzz", ""]])
    tm.assert_series_equal(result, expected)


def test_split_regex_explicit(any_string_dtype):
    """Exercise ``split`` with compiled, literal and implicitly-regex patterns."""
    regex_pat = re.compile(r".jpg")
    ser = Series("xxxjpgzzz.jpg", dtype=any_string_dtype)

    # Each case is (pattern, extra keyword arguments, expected row data).
    cases = [
        # explicit regex = True split with compiled regex
        (regex_pat, {}, [["xx", "zzz", ""]]),
        # explicit regex = False split
        (r"\.jpg", {"regex": False}, [["xxxjpgzzz.jpg"]]),
        # non explicit regex split, pattern length == 1
        (r".", {}, [["xxxjpgzzz", "jpg"]]),
        # non explicit regex split, pattern length != 1
        (r".jpg", {}, [["xx", "zzz", ""]]),
    ]
    for pat, kwargs, expected_data in cases:
        result = ser.str.split(pat, **kwargs)
        tm.assert_series_equal(result, Series(expected_data))

    # regex=False with pattern compiled regex raises error
    msg = "Cannot use a compiled regex as replacement pattern with regex=False"
    with pytest.raises(ValueError, match=msg):
        ser.str.split(regex_pat, regex=False)


@pytest.mark.parametrize("expand", [None, False])
@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_object_mixed(expand, method):
    """Split an object Series that mixes strings with non-string values."""
    mixed = Series(
        ["a_b_c", np.nan, "d_e_f", True, datetime.today(), None, 1, 2.0]
    )

    result = getattr(mixed.str, method)("_", expand=expand)

    # Only the two string entries are expected to come back as lists.
    expected_rows = [
        ["a", "b", "c"],
        np.nan,
        ["d", "e", "f"],
        np.nan,
        np.nan,
        None,
        np.nan,
        np.nan,
    ]
    expected = Series(expected_rows)
    assert isinstance(result, Series)
    tm.assert_almost_equal(result, expected)


@pytest.mark.parametrize("method", ["split", "rsplit"])
@pytest.mark.parametrize("n", [None, 0])
def test_split_n(any_string_dtype, method, n):
    """Split on a space with ``n`` given as ``None`` or ``0``."""
    ser = Series(["a b", pd.NA, "b c"], dtype=any_string_dtype)
    expected = Series([["a", "b"], pd.NA, ["b", "c"]])

    splitter = getattr(ser.str, method)
    result = splitter(" ", n=n)

    expected = _convert_na_value(ser, expected)
    tm.assert_series_equal(result, expected)


def test_rsplit(any_string_dtype):
    """``rsplit`` with a regex-looking pattern leaves every string unsplit."""
    # regex split is not supported by rsplit
    ser = Series(["a,b_c", "c_d,e", np.nan, "f,g,h"], dtype=any_string_dtype)

    result = ser.str.rsplit("[,_]")

    expected = Series([["a,b_c"], ["c_d,e"], np.nan, ["f,g,h"]])
    expected = _convert_na_value(ser, expected)
    tm.assert_series_equal(result, expected)


def test_rsplit_max_number(any_string_dtype):
    """``rsplit`` with ``n=1`` splits once, starting from the right."""
    # setting max number of splits, make sure it's from reverse
    ser = Series(["a_b_c", "c_d_e", np.nan, "f_g_h"], dtype=any_string_dtype)

    result = ser.str.rsplit("_", n=1)

    expected = Series([["a_b", "c"], ["c_d", "e"], np.nan, ["f_g", "h"]])
    expected = _convert_na_value(ser, expected)
    tm.assert_series_equal(result, expected)


def test_split_blank_string(any_string_dtype):
    """Expanding a single blank string gives a one-row frame with no columns."""
    # expand blank split GH 20067
    ser = Series([""], name="test", dtype=any_string_dtype)

    result = ser.str.split(expand=True)

    # NOTE: this is NOT an empty df
    expected = DataFrame([[]], dtype=any_string_dtype)
    tm.assert_frame_equal(result, expected)


def test_split_blank_string_with_non_empty(any_string_dtype):
    """Expand a mix of blank and non-blank strings into a padded frame."""
    ser = Series(["a b c", "a b", "", " "], name="test", dtype=any_string_dtype)

    result = ser.str.split(expand=True)

    expected_rows = [
        ["a", "b", "c"],
        ["a", "b", None],
        [None, None, None],
        [None, None, None],
    ]
    expected = DataFrame(expected_rows, dtype=any_string_dtype)
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize("method", ["split", "rsplit"])
def test_split_noargs(any_string_dtype, method):
    """Call the splitter with no arguments and check the second row."""
    # #1859
    ser = Series(["Wes McKinney", "Travis Oliphant"], dtype=any_string_dtype)

    splitter = getattr(ser.str, method)
    result = splitter()

    expected = ["Travis", "Oliphant"]
    assert result[1] == expected


@pytest.mark.parametrize(
    "data, pat",
    [
        (["bd asdf jfg", "kjasdflqw asdfnfk"], None),
        (["bd asdf jfg", "kjasdflqw asdfnfk"], "asdf"),
        (["bd_asdf_jfg", "kjasdflqw_asdfnfk"], "_"),
    ],
)
@pytest.mark.parametrize("n", [-1, 0])
def test_split_maxsplit(data, pat, any_string_dtype, n):
    """``n=-1`` and ``n=0`` must match a split that leaves ``n`` unset."""
    # re.split 0, str.split -1
    ser = Series(data, dtype=any_string_dtype)

    result = ser.str.split(pat=pat, n=n)
    unlimited = ser.str.split(pat=pat)

    tm.assert_series_equal(result, unlimited)


@pytest.mark.parametrize(
    "data, pat, expected_val",
    [
        (
            ["split once", "split once too!"],
            None,
            "once too!",
        ),
        (
            ["split_once", "split_once_too!"],
            "_",
            "once_too!",
        ),
    ],
)
def test_split_no_pat_with_nonzero_n(data, pat, expected_val, any_string_dtype):
    """With ``n=1`` only the first separator is used, for any ``pat`` given."""
    ser = Series(data, dtype=any_string_dtype)

    result = ser.str.split(pat=pat, n=1)

    # The expectation is built from a dict, so the index type is not compared.
    expected = Series({0: ["split", "once"], 1: ["split", expected_val]})
    tm.assert_series_equal(expected, result, check_index_type=False)
def test_split_to_dataframe_no_splits(any_string_dtype): s = Series(["nosplit", "alsonosplit"], dtype=any_string_dtype) result = s.str.split("_", expand=True) exp = DataFrame({0: Series(["nosplit", "alsonosplit"], dtype=any_string_dtype)}) tm.assert_frame_equal(result, exp) def test_split_to_dataframe(any_string_dtype): s = Series(["some_equal_splits", "with_no_nans"], dtype=any_string_dtype) result = s.str.split("_", expand=True) exp = DataFrame( {0: ["some", "with"], 1: ["equal", "no"], 2: ["splits", "nans"]}, dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) def test_split_to_dataframe_unequal_splits(any_string_dtype): s = Series( ["some_unequal_splits", "one_of_these_things_is_not"], dtype=any_string_dtype ) result = s.str.split("_", expand=True) exp = DataFrame( { 0: ["some", "one"], 1: ["unequal", "of"], 2: ["splits", "these"], 3: [None, "things"], 4: [None, "is"], 5: [None, "not"], }, dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) def test_split_to_dataframe_with_index(any_string_dtype): s = Series( ["some_splits", "with_index"], index=["preserve", "me"], dtype=any_string_dtype ) result = s.str.split("_", expand=True) exp = DataFrame( {0: ["some", "with"], 1: ["splits", "index"]}, index=["preserve", "me"], dtype=any_string_dtype, ) tm.assert_frame_equal(result, exp) with pytest.raises(ValueError, match="expand must be"): s.str.split("_", expand="not_a_boolean") def test_split_to_multiindex_expand_no_splits(): # https://github.com/pandas-dev/pandas/issues/23677 idx = Index(["nosplit", "alsonosplit", np.nan]) result = idx.str.split("_", expand=True) exp = idx tm.assert_index_equal(result, exp) assert result.nlevels == 1 def test_split_to_multiindex_expand(): idx = Index(["some_equal_splits", "with_no_nans", np.nan, None]) result = idx.str.split("_", expand=True) exp = MultiIndex.from_tuples( [ ("some", "equal", "splits"), ("with", "no", "nans"), [np.nan, np.nan, np.nan], [None, None, None], ] ) tm.assert_index_equal(result, exp) assert 
result.nlevels == 3 def test_split_to_multiindex_expand_unequal_splits(): idx = Index(["some_unequal_splits", "one_of_these_things_is_not", np.nan, None]) result = idx.str.split("_", expand=True) exp = MultiIndex.from_tuples( [ ("some", "unequal", "splits", np.nan, np.nan, np.nan), ("one", "of", "these", "things", "is", "not"), (np.nan, np.nan, np.nan, np.nan, np.nan, np.nan), (None, None, None, None, None, None), ] ) tm.assert_index_equal(result, exp) assert result.nlevels == 6 with pytest.raises(ValueError, match="expand must be"): idx.str.split("_", expand="not_a_boolean") @pytest.mark.parametrize( "pat, expected_data", [ (r"a(?=b)", [["aa"], ["", "b"], ["ba"], ["bb"]]), (r"(?<=a)b", [["aa"], ["a", ""], ["ba"], ["bb"]]), (r"a(?!b)", [["", "", ""], ["ab"], ["b", ""], ["bb"]]), (r"(?