Time-series search with early stopping












0















I want to search for a pattern in a time-series while either ignoring the mean/shift/bias or the scale/standard deviation.



Consequently, I've written two functions.



The first function searches passes through the time-series, incrementally calculating the mean for each search-space sub-sequence and using this mean to normalize the sub-sequence before comparing it to a normalized query.



function euc_dist(data::Vector{Float64}, query::Vector{Float64}, current_best::Float64)::Float64
sum = 0

for (dd, qq) in zip(data, query)
sum += (dd - qq) ^ 2
if sum >= current_best
break
end
end

return sum
end


function run_ignore_bias(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
m = length(query)

# normalize query in same manner data sub-sequence will be normalized
query = query .- (sum(query) / m)

current_best = Inf
loc = -1

# Keep current data in a double-size array to avoid using modulo
# Basically, the data is stored twice and weird indexing arithmetic is used to avoid
# using a LIFO queue and negative indexing.
# Computational efficiency benefit unclear.
t = zeros(Float64, 2*m)
tz = zeros(Float64, m)

run_sum = 0.
run_sum2 = 0.

for (d_i, dat) in enumerate(data)
run_sum += dat
run_sum2 += dat ^ 2

t_idx = ((d_i - 1) % m) + 1
t[t_idx] = dat
t[t_idx + m] = dat

if d_i >= m
run_mean = run_sum / m

# offset for search-space data
s_off = (d_i % m) + 1
# offset for search-space bound data
s_bound_off = (d_i - 1) - (m - 1) + 1

tz = t[s_off:s_off + m - 1] .- run_mean
dist = euc_dist(tz, query, current_best)

if dist < current_best
current_best = dist
loc = s_bound_off
end

run_sum -= t[s_off]
run_sum2 -= t[s_off] ^ 2
end
end

return sqrt(current_best), loc
end


The second function does the same, except it normalizes according to the standard deviation.



function run_ignore_scale(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
m = length(query)

# normalize scale query
q_mean = sum(query) / m
query = query / sqrt(sum(query.^2)/m - q_mean^2)
current_best = Inf
loc = -1

# Keep current data in a double-size array to avoid using modulo
# Basically, the data is stored twice and weird indexing arithmetic is used to avoid
# using a LIFO queue and negative indexing.
# Computational efficiency benefit unclear.
t = zeros(Float64, 2*m)
tz = zeros(Float64, m)

run_sum = 0.
run_sum2 = 0.

for (d_i, dat) in enumerate(data)
run_sum += dat
run_sum2 += dat ^ 2

t_idx = ((d_i - 1) % m) + 1
t[t_idx] = dat
t[t_idx + m] = dat

if d_i >= m
run_mean = run_sum / m
# occasionally, a floating point error can cause this value to be negative, thus take the absolute value before sqrt
run_std = sqrt(abs((run_sum2 / m) - (run_mean^2)))

# offset for search-space data
s_off = (d_i % m) + 1
# offset for search-space bound data
s_bound_off = (d_i - 1) - (m - 1) + 1

tz = t[s_off:s_off + m - 1] / run_std
dist = euc_dist(tz, query, current_best)
@assert dist > 0

if dist < current_best
current_best = dist
loc = s_bound_off
end

run_sum -= t[s_off]
run_sum2 -= t[s_off] ^ 2
end
end

return sqrt(current_best), loc
end


Here are the tests for both functions.



using Test

@testset "ignore bias" begin
sig = [.2, .3, .5, -.4, .2, .3]
data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
val, idx = run_ignore_bias(data, sig)
# should find shifted signal, but not scaled signal
@test idx == 3
@test isapprox(val, 0., atol=0.001)
end


@testset "ignore scale" begin
sig = [.2, .3, .5, -.4, .2, .3]
data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
val, idx = run_ignore_scale(data, sig)
# should find scaled signal, but not shifted
@test idx == 17
@test isapprox(val, 0., atol=0.001)
end


@testset "dist calc" begin
dist = euc_dist([1., 2., 3.], [4., 5., 6.], Inf)
@test isapprox(dist, 27.0, atol=0.001)

dist = euc_dist([1., 2., 3.], [4., 5., 6.], 8.)
@test isapprox(dist, 9.0, atol=0.001)
end


How do I reduce the code duplication between these two functions?










share|improve this question



























    0















    I want to search for a pattern in a time-series while either ignoring the mean/shift/bias or the scale/standard deviation.



    Consequently, I've written two functions.



    The first function searches passes through the time-series, incrementally calculating the mean for each search-space sub-sequence and using this mean to normalize the sub-sequence before comparing it to a normalized query.



    function euc_dist(data::Vector{Float64}, query::Vector{Float64}, current_best::Float64)::Float64
    sum = 0

    for (dd, qq) in zip(data, query)
    sum += (dd - qq) ^ 2
    if sum >= current_best
    break
    end
    end

    return sum
    end


    function run_ignore_bias(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
    m = length(query)

    # normalize query in same manner data sub-sequence will be normalized
    query = query .- (sum(query) / m)

    current_best = Inf
    loc = -1

    # Keep current data in a double-size array to avoid using modulo
    # Basically, the data is stored twice and weird indexing arithmetic is used to avoid
    # using a LIFO queue and negative indexing.
    # Computational efficiency benefit unclear.
    t = zeros(Float64, 2*m)
    tz = zeros(Float64, m)

    run_sum = 0.
    run_sum2 = 0.

    for (d_i, dat) in enumerate(data)
    run_sum += dat
    run_sum2 += dat ^ 2

    t_idx = ((d_i - 1) % m) + 1
    t[t_idx] = dat
    t[t_idx + m] = dat

    if d_i >= m
    run_mean = run_sum / m

    # offset for search-space data
    s_off = (d_i % m) + 1
    # offset for search-space bound data
    s_bound_off = (d_i - 1) - (m - 1) + 1

    tz = t[s_off:s_off + m - 1] .- run_mean
    dist = euc_dist(tz, query, current_best)

    if dist < current_best
    current_best = dist
    loc = s_bound_off
    end

    run_sum -= t[s_off]
    run_sum2 -= t[s_off] ^ 2
    end
    end

    return sqrt(current_best), loc
    end


    The second function does the same, except it normalizes according to the standard deviation.



    function run_ignore_scale(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
    m = length(query)

    # normalize scale query
    q_mean = sum(query) / m
    query = query / sqrt(sum(query.^2)/m - q_mean^2)
    current_best = Inf
    loc = -1

    # Keep current data in a double-size array to avoid using modulo
    # Basically, the data is stored twice and weird indexing arithmetic is used to avoid
    # using a LIFO queue and negative indexing.
    # Computational efficiency benefit unclear.
    t = zeros(Float64, 2*m)
    tz = zeros(Float64, m)

    run_sum = 0.
    run_sum2 = 0.

    for (d_i, dat) in enumerate(data)
    run_sum += dat
    run_sum2 += dat ^ 2

    t_idx = ((d_i - 1) % m) + 1
    t[t_idx] = dat
    t[t_idx + m] = dat

    if d_i >= m
    run_mean = run_sum / m
    # occasionally, a floating point error can cause this value to be negative, thus take the absolute value before sqrt
    run_std = sqrt(abs((run_sum2 / m) - (run_mean^2)))

    # offset for search-space data
    s_off = (d_i % m) + 1
    # offset for search-space bound data
    s_bound_off = (d_i - 1) - (m - 1) + 1

    tz = t[s_off:s_off + m - 1] / run_std
    dist = euc_dist(tz, query, current_best)
    @assert dist > 0

    if dist < current_best
    current_best = dist
    loc = s_bound_off
    end

    run_sum -= t[s_off]
    run_sum2 -= t[s_off] ^ 2
    end
    end

    return sqrt(current_best), loc
    end


    Here are the tests for both functions.



    using Test

    @testset "ignore bias" begin
    sig = [.2, .3, .5, -.4, .2, .3]
    data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
    val, idx = run_ignore_bias(data, sig)
    # should find shifted signal, but not scaled signal
    @test idx == 3
    @test isapprox(val, 0., atol=0.001)
    end


    @testset "ignore scale" begin
    sig = [.2, .3, .5, -.4, .2, .3]
    data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
    val, idx = run_ignore_scale(data, sig)
    # should find scaled signal, but not shifted
    @test idx == 17
    @test isapprox(val, 0., atol=0.001)
    end


    @testset "dist calc" begin
    dist = euc_dist([1., 2., 3.], [4., 5., 6.], Inf)
    @test isapprox(dist, 27.0, atol=0.001)

    dist = euc_dist([1., 2., 3.], [4., 5., 6.], 8.)
    @test isapprox(dist, 9.0, atol=0.001)
    end


    How do I reduce the code duplication between these two functions?










    share|improve this question

























      0












      0








      0








      I want to search for a pattern in a time-series while either ignoring the mean/shift/bias or the scale/standard deviation.



      Consequently, I've written two functions.



      The first function searches passes through the time-series, incrementally calculating the mean for each search-space sub-sequence and using this mean to normalize the sub-sequence before comparing it to a normalized query.



      function euc_dist(data::Vector{Float64}, query::Vector{Float64}, current_best::Float64)::Float64
      sum = 0

      for (dd, qq) in zip(data, query)
      sum += (dd - qq) ^ 2
      if sum >= current_best
      break
      end
      end

      return sum
      end


      function run_ignore_bias(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
      m = length(query)

      # normalize query in same manner data sub-sequence will be normalized
      query = query .- (sum(query) / m)

      current_best = Inf
      loc = -1

      # Keep current data in a double-size array to avoid using modulo
      # Basically, the data is stored twice and weird indexing arithmetic is used to avoid
      # using a LIFO queue and negative indexing.
      # Computational efficiency benefit unclear.
      t = zeros(Float64, 2*m)
      tz = zeros(Float64, m)

      run_sum = 0.
      run_sum2 = 0.

      for (d_i, dat) in enumerate(data)
      run_sum += dat
      run_sum2 += dat ^ 2

      t_idx = ((d_i - 1) % m) + 1
      t[t_idx] = dat
      t[t_idx + m] = dat

      if d_i >= m
      run_mean = run_sum / m

      # offset for search-space data
      s_off = (d_i % m) + 1
      # offset for search-space bound data
      s_bound_off = (d_i - 1) - (m - 1) + 1

      tz = t[s_off:s_off + m - 1] .- run_mean
      dist = euc_dist(tz, query, current_best)

      if dist < current_best
      current_best = dist
      loc = s_bound_off
      end

      run_sum -= t[s_off]
      run_sum2 -= t[s_off] ^ 2
      end
      end

      return sqrt(current_best), loc
      end


      The second function does the same, except it normalizes according to the standard deviation.



      function run_ignore_scale(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
      m = length(query)

      # normalize scale query
      q_mean = sum(query) / m
      query = query / sqrt(sum(query.^2)/m - q_mean^2)
      current_best = Inf
      loc = -1

      # Keep current data in a double-size array to avoid using modulo
      # Basically, the data is stored twice and weird indexing arithmetic is used to avoid
      # using a LIFO queue and negative indexing.
      # Computational efficiency benefit unclear.
      t = zeros(Float64, 2*m)
      tz = zeros(Float64, m)

      run_sum = 0.
      run_sum2 = 0.

      for (d_i, dat) in enumerate(data)
      run_sum += dat
      run_sum2 += dat ^ 2

      t_idx = ((d_i - 1) % m) + 1
      t[t_idx] = dat
      t[t_idx + m] = dat

      if d_i >= m
      run_mean = run_sum / m
      # occasionally, a floating point error can cause this value to be negative, thus take the absolute value before sqrt
      run_std = sqrt(abs((run_sum2 / m) - (run_mean^2)))

      # offset for search-space data
      s_off = (d_i % m) + 1
      # offset for search-space bound data
      s_bound_off = (d_i - 1) - (m - 1) + 1

      tz = t[s_off:s_off + m - 1] / run_std
      dist = euc_dist(tz, query, current_best)
      @assert dist > 0

      if dist < current_best
      current_best = dist
      loc = s_bound_off
      end

      run_sum -= t[s_off]
      run_sum2 -= t[s_off] ^ 2
      end
      end

      return sqrt(current_best), loc
      end


      Here are the tests for both functions.



      using Test

      @testset "ignore bias" begin
      sig = [.2, .3, .5, -.4, .2, .3]
      data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
      val, idx = run_ignore_bias(data, sig)
      # should find shifted signal, but not scaled signal
      @test idx == 3
      @test isapprox(val, 0., atol=0.001)
      end


      @testset "ignore scale" begin
      sig = [.2, .3, .5, -.4, .2, .3]
      data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
      val, idx = run_ignore_scale(data, sig)
      # should find scaled signal, but not shifted
      @test idx == 17
      @test isapprox(val, 0., atol=0.001)
      end


      @testset "dist calc" begin
      dist = euc_dist([1., 2., 3.], [4., 5., 6.], Inf)
      @test isapprox(dist, 27.0, atol=0.001)

      dist = euc_dist([1., 2., 3.], [4., 5., 6.], 8.)
      @test isapprox(dist, 9.0, atol=0.001)
      end


      How do I reduce the code duplication between these two functions?










      share|improve this question














      I want to search for a pattern in a time-series while either ignoring the mean/shift/bias or the scale/standard deviation.



      Consequently, I've written two functions.



      The first function searches passes through the time-series, incrementally calculating the mean for each search-space sub-sequence and using this mean to normalize the sub-sequence before comparing it to a normalized query.



      function euc_dist(data::Vector{Float64}, query::Vector{Float64}, current_best::Float64)::Float64
      sum = 0

      for (dd, qq) in zip(data, query)
      sum += (dd - qq) ^ 2
      if sum >= current_best
      break
      end
      end

      return sum
      end


      function run_ignore_bias(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
      m = length(query)

      # normalize query in same manner data sub-sequence will be normalized
      query = query .- (sum(query) / m)

      current_best = Inf
      loc = -1

      # Keep current data in a double-size array to avoid using modulo
      # Basically, the data is stored twice and weird indexing arithmetic is used to avoid
      # using a LIFO queue and negative indexing.
      # Computational efficiency benefit unclear.
      t = zeros(Float64, 2*m)
      tz = zeros(Float64, m)

      run_sum = 0.
      run_sum2 = 0.

      for (d_i, dat) in enumerate(data)
      run_sum += dat
      run_sum2 += dat ^ 2

      t_idx = ((d_i - 1) % m) + 1
      t[t_idx] = dat
      t[t_idx + m] = dat

      if d_i >= m
      run_mean = run_sum / m

      # offset for search-space data
      s_off = (d_i % m) + 1
      # offset for search-space bound data
      s_bound_off = (d_i - 1) - (m - 1) + 1

      tz = t[s_off:s_off + m - 1] .- run_mean
      dist = euc_dist(tz, query, current_best)

      if dist < current_best
      current_best = dist
      loc = s_bound_off
      end

      run_sum -= t[s_off]
      run_sum2 -= t[s_off] ^ 2
      end
      end

      return sqrt(current_best), loc
      end


      The second function does the same, except it normalizes according to the standard deviation.



      function run_ignore_scale(data::Vector{Float64}, query::Vector{Float64})::Tuple{Float64, Int}
      m = length(query)

      # normalize scale query
      q_mean = sum(query) / m
      query = query / sqrt(sum(query.^2)/m - q_mean^2)
      current_best = Inf
      loc = -1

      # Keep current data in a double-size array to avoid using modulo
      # Basically, the data is stored twice and weird indexing arithmetic is used to avoid
      # using a LIFO queue and negative indexing.
      # Computational efficiency benefit unclear.
      t = zeros(Float64, 2*m)
      tz = zeros(Float64, m)

      run_sum = 0.
      run_sum2 = 0.

      for (d_i, dat) in enumerate(data)
      run_sum += dat
      run_sum2 += dat ^ 2

      t_idx = ((d_i - 1) % m) + 1
      t[t_idx] = dat
      t[t_idx + m] = dat

      if d_i >= m
      run_mean = run_sum / m
      # occasionally, a floating point error can cause this value to be negative, thus take the absolute value before sqrt
      run_std = sqrt(abs((run_sum2 / m) - (run_mean^2)))

      # offset for search-space data
      s_off = (d_i % m) + 1
      # offset for search-space bound data
      s_bound_off = (d_i - 1) - (m - 1) + 1

      tz = t[s_off:s_off + m - 1] / run_std
      dist = euc_dist(tz, query, current_best)
      @assert dist > 0

      if dist < current_best
      current_best = dist
      loc = s_bound_off
      end

      run_sum -= t[s_off]
      run_sum2 -= t[s_off] ^ 2
      end
      end

      return sqrt(current_best), loc
      end


      Here are the tests for both functions.



      using Test

      @testset "ignore bias" begin
      sig = [.2, .3, .5, -.4, .2, .3]
      data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
      val, idx = run_ignore_bias(data, sig)
      # should find shifted signal, but not scaled signal
      @test idx == 3
      @test isapprox(val, 0., atol=0.001)
      end


      @testset "ignore scale" begin
      sig = [.2, .3, .5, -.4, .2, .3]
      data = vcat(zeros(2), sig .+ 1., zeros(8), 2*sig, zeros(4))
      val, idx = run_ignore_scale(data, sig)
      # should find scaled signal, but not shifted
      @test idx == 17
      @test isapprox(val, 0., atol=0.001)
      end


      @testset "dist calc" begin
      dist = euc_dist([1., 2., 3.], [4., 5., 6.], Inf)
      @test isapprox(dist, 27.0, atol=0.001)

      dist = euc_dist([1., 2., 3.], [4., 5., 6.], 8.)
      @test isapprox(dist, 9.0, atol=0.001)
      end


      How do I reduce the code duplication between these two functions?







      julia






      share|improve this question













      share|improve this question











      share|improve this question




      share|improve this question










      asked 14 mins ago









      Seanny123Seanny123

      5951827




      5951827






















          0






          active

          oldest

          votes











          Your Answer





          StackExchange.ifUsing("editor", function () {
          return StackExchange.using("mathjaxEditing", function () {
          StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
          StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
          });
          });
          }, "mathjax-editing");

          StackExchange.ifUsing("editor", function () {
          StackExchange.using("externalEditor", function () {
          StackExchange.using("snippets", function () {
          StackExchange.snippets.init();
          });
          });
          }, "code-snippets");

          StackExchange.ready(function() {
          var channelOptions = {
          tags: "".split(" "),
          id: "196"
          };
          initTagRenderer("".split(" "), "".split(" "), channelOptions);

          StackExchange.using("externalEditor", function() {
          // Have to fire editor after snippets, if snippets enabled
          if (StackExchange.settings.snippets.snippetsEnabled) {
          StackExchange.using("snippets", function() {
          createEditor();
          });
          }
          else {
          createEditor();
          }
          });

          function createEditor() {
          StackExchange.prepareEditor({
          heartbeatType: 'answer',
          autoActivateHeartbeat: false,
          convertImagesToLinks: false,
          noModals: true,
          showLowRepImageUploadWarning: true,
          reputationToPostImages: null,
          bindNavPrevention: true,
          postfix: "",
          imageUploader: {
          brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
          contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
          allowUrls: true
          },
          onDemand: true,
          discardSelector: ".discard-answer"
          ,immediatelyShowMarkdownHelp:true
          });


          }
          });














          draft saved

          draft discarded


















          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211336%2ftime-series-search-with-early-stopping%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown

























          0






          active

          oldest

          votes








          0






          active

          oldest

          votes









          active

          oldest

          votes






          active

          oldest

          votes
















          draft saved

          draft discarded




















































          Thanks for contributing an answer to Code Review Stack Exchange!


          • Please be sure to answer the question. Provide details and share your research!

          But avoid



          • Asking for help, clarification, or responding to other answers.

          • Making statements based on opinion; back them up with references or personal experience.


          Use MathJax to format equations. MathJax reference.


          To learn more, see our tips on writing great answers.




          draft saved


          draft discarded














          StackExchange.ready(
          function () {
          StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f211336%2ftime-series-search-with-early-stopping%23new-answer', 'question_page');
          }
          );

          Post as a guest















          Required, but never shown





















































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown

































          Required, but never shown














          Required, but never shown












          Required, but never shown







          Required, but never shown







          Popular posts from this blog

          404 Error Contact Form 7 ajax form submitting

          How to know if a Active Directory user can login interactively

          TypeError: fit_transform() missing 1 required positional argument: 'X'