Tricky update query 
Author Message
 Tricky update query

I am trying to update a table that has millions of rows with the
results of a calculated value from another table of slightly smaller
size.  The update statement is taking a very long time, which doesn't
surprise me given the amount of data, but i can't help but feel like
it could be faster if written optimally.

Here is what i am doing now (slightly modified, so obviously the
calculated numbers appear completely meaningless)

I think my major bottleneck is that I need to exclude the top n
observations out of my Sec_P table based on the number of days in my
historical_val table.  since i am using absolute value, it seems like
this subquery is taking the longest time.  any ideas on this one?

use pubs
go

CREATE TABLE [Historical_Val] (
        [ID] [int] NOT NULL ,
        [Date] [smalldatetime] NOT NULL ,
        [Symbol] [varchar] (32) COLLATE SQL_Latin1_General_CP1_CI_AS NULL ,
        [Days] [int] NOT NULL ,
        [Val] [float] NULL ,
        [F_Val] [float] NULL ,
        CONSTRAINT [PK_Historical_Val] PRIMARY KEY  CLUSTERED
        (
                [ID],
                [Date],
                [Days]
        ) WITH  FILLFACTOR = 90  ON [PRIMARY]
) ON [PRIMARY]
GO

INSERT INTO Historical_Val
VALUES
(1, '1/14/2002', 'A', 14, 1.0, NULL)

CREATE TABLE [SEC_P] (
        [ID] [int] NOT NULL ,
        [Date] [smalldatetime] NOT NULL ,
        [SomeVal] [float] NOT NULL ,
        CONSTRAINT [PK_SEC_P] PRIMARY KEY  CLUSTERED
        (
                [ID],
                [Date]
        ) WITH  FILLFACTOR = 90  ON [PRIMARY]
) ON [PRIMARY]
GO

INSERT INTO Sec_P
VALUES
(1, '1/1/2002', 10.0)

INSERT INTO Sec_P
VALUES
(1, '1/2/2002', 10.0)

INSERT INTO Sec_P
VALUES
(1, '1/3/2002', 10.0)

INSERT INTO Sec_P
VALUES
(1, '1/4/2002', 10.0)

select * from sec_p
where date between '1/1/2002' and '1/14/2002'
GO

UPDATE Historical_Val
SET F_Val =
        CASE

        WHEN Historical_Val.[Days] = 14
        THEN
        (SELECT 100* SUM(p.[SomeVal])
        FROM Sec_P p
        WHERE p.[ID] = Historical_Val.[ID] AND p.[Date] BETWEEN
Historical_Val.[Date] - 13 AND Historical_Val.[Date]
        AND p.SomeVal > -1.0
        AND p.[Date]
        NOT IN
        (SELECT TOP 1 [Date]
        FROM Sec_P sp
        WHERE sp.[ID] = p.[ID] AND sp.[Date] BETWEEN Historical_Val.[Date] -
13 AND Historical_Val.[Date]
        AND sp.SomeVal > -1.0
        ORDER BY ABS(SomeVal) DESC ))

        WHEN Historical_Val.[Days] = 30
        THEN
        (SELECT 100* SQRT(p.[SomeVal])
        FROM Sec_P p
        WHERE p.[ID] = Historical_Val.[ID] AND p.[Date] BETWEEN
Historical_Val.[Date] - 29 AND Historical_Val.[Date]
        AND p.SomeVal > -1.0
        AND p.[Date]
        NOT IN
        (SELECT TOP 2 [Date]
        FROM Sec_P sp
        WHERE sp.[ID] = p.[ID] AND sp.[Date] BETWEEN Historical_Val.[Date] -
29 AND Historical_Val.[Date]
        AND sp.SomeVal > -1.0
        ORDER BY ABS(SomeVal) DESC ))

        END

FROM
Historical_Val
WHERE
Historical_Val.[Date] BETWEEN '1/14/2002' AND '1/14/2002'

SELECT * FROM Historical_Val

DROP TABLE Historical_Val
DROP TABLE Sec_P



Tue, 22 Mar 2005 23:31:57 GMT
 Tricky update query

Hi

Here are couple of things you may wish to profile/consider:

1. Splitting the query down into two separate updates:

UPDATE Historical_Val
SET F_Val = (SELECT 100* SUM(p.[SomeVal])
        FROM Sec_P p
        WHERE p.[ID] = h.[ID]
        AND p.[Date] BETWEEN h.[Date] - 13 AND h.[Date]
        AND p.SomeVal > -1.0
        AND p.[Date]
        NOT IN
        (SELECT TOP 1 [Date]
                FROM Sec_P sp
                WHERE sp.[ID] = p.[ID]
                AND sp.[Date] BETWEEN h.[Date] - 13 AND h.[Date]
                AND sp.SomeVal > -1.0
                ORDER BY ABS(SomeVal) DESC ))
FROM Historical_Val h
WHERE h.[Days] = 14
AND h.[Date] BETWEEN '1/14/2002' AND '1/14/2002'

UPDATE Historical_Val
SET F_Val = (SELECT 100* SQRT(p.[SomeVal])
        FROM Sec_P p
        WHERE p.[ID] = h.[ID]
        AND p.[Date] BETWEEN h.[Date] - 29 AND h.[Date]
        AND p.SomeVal > -1.0
        AND p.[Date]
        NOT IN (SELECT TOP 2 [Date]
                FROM Sec_P sp
                WHERE sp.[ID] = p.[ID]
                AND sp.[Date] BETWEEN h.[Date] - 29 AND h.[Date]
                AND sp.SomeVal > -1.0
                ORDER BY ABS(SomeVal) DESC ))
FROM Historical_Val h
WHERE h.[Days] = 30
AND h.[Date] BETWEEN '1/14/2002' AND '1/14/2002'

2. In the subquery why use ABS if sp.SomeVal > -1.0,

3. In this case TOP 1 is just where SumVal = Max(SumVal)

4. The second update does not make a great deal of sense (to me) as
you can get multiple values, do you want SQRT (SUM(p.[SomeVal]))?

John

Quote:

> I am trying to update a table that has millions of rows with the
> results of a calculated value from another table of slightly smaller
> size.  The update statement is taking a very long time, which doesn't
> surprise me given the amount of data, but i can't help but feel like
> it could be faster if written optimally.

> Here is what i am doing now (slightly modified, so obviously the
> calculated numbers appear completely meaningless)

> I think my major bottleneck is that I need to exclude the top n
> observations out of my Sec_P table based on the number of days in my
> historical_val table.  since i am using absolute value, it seems like
> this subquery is taking the longest time.  any ideas on this one?

> use pubs
> go

> CREATE TABLE [Historical_Val] (
>    [ID] [int] NOT NULL ,
>    [Date] [smalldatetime] NOT NULL ,
>    [Symbol] [varchar] (32) COLLATE SQL_Latin1_General_CP1_CI_AS NULL ,
>    [Days] [int] NOT NULL ,
>    [Val] [float] NULL ,
>    [F_Val] [float] NULL ,
>    CONSTRAINT [PK_Historical_Val] PRIMARY KEY  CLUSTERED
>    (
>            [ID],
>            [Date],
>            [Days]
>    ) WITH  FILLFACTOR = 90  ON [PRIMARY]
> ) ON [PRIMARY]
> GO

> INSERT INTO Historical_Val
> VALUES
> (1, '1/14/2002', 'A', 14, 1.0, NULL)

> CREATE TABLE [SEC_P] (
>    [ID] [int] NOT NULL ,
>    [Date] [smalldatetime] NOT NULL ,
>    [SomeVal] [float] NOT NULL ,
>    CONSTRAINT [PK_SEC_P] PRIMARY KEY  CLUSTERED
>    (
>            [ID],
>            [Date]
>    ) WITH  FILLFACTOR = 90  ON [PRIMARY]
> ) ON [PRIMARY]
> GO

> INSERT INTO Sec_P
> VALUES
> (1, '1/1/2002', 10.0)

> INSERT INTO Sec_P
> VALUES
> (1, '1/2/2002', 10.0)

> INSERT INTO Sec_P
> VALUES
> (1, '1/3/2002', 10.0)

> INSERT INTO Sec_P
> VALUES
> (1, '1/4/2002', 10.0)

> select * from sec_p
> where date between '1/1/2002' and '1/14/2002'
> GO

> UPDATE Historical_Val
> SET F_Val =
>    CASE

>    WHEN Historical_Val.[Days] = 14
>    THEN
>    (SELECT 100* SUM(p.[SomeVal])
>    FROM Sec_P p
>    WHERE p.[ID] = Historical_Val.[ID] AND p.[Date] BETWEEN
> Historical_Val.[Date] - 13 AND Historical_Val.[Date]
>    AND p.SomeVal > -1.0
>    AND p.[Date]
>    NOT IN
>    (SELECT TOP 1 [Date]
>    FROM Sec_P sp
>    WHERE sp.[ID] = p.[ID] AND sp.[Date] BETWEEN Historical_Val.[Date] -
> 13 AND Historical_Val.[Date]
>    AND sp.SomeVal > -1.0
>    ORDER BY ABS(SomeVal) DESC ))

>    WHEN Historical_Val.[Days] = 30
>    THEN
>    (SELECT 100* SQRT(p.[SomeVal])
>    FROM Sec_P p
>    WHERE p.[ID] = Historical_Val.[ID] AND p.[Date] BETWEEN
> Historical_Val.[Date] - 29 AND Historical_Val.[Date]
>    AND p.SomeVal > -1.0
>    AND p.[Date]
>    NOT IN
>    (SELECT TOP 2 [Date]
>    FROM Sec_P sp
>    WHERE sp.[ID] = p.[ID] AND sp.[Date] BETWEEN Historical_Val.[Date] -
> 29 AND Historical_Val.[Date]
>    AND sp.SomeVal > -1.0
>    ORDER BY ABS(SomeVal) DESC ))

>    END

> FROM
> Historical_Val
> WHERE
> Historical_Val.[Date] BETWEEN '1/14/2002' AND '1/14/2002'

> SELECT * FROM Historical_Val

> DROP TABLE Historical_Val
> DROP TABLE Sec_P



Thu, 24 Mar 2005 04:47:32 GMT
 Tricky update query

Quote:

> I think my major bottleneck is that I need to exclude the top n
> observations out of my Sec_P table based on the number of days in my
> historical_val table.  since i am using absolute value, it seems like
> this subquery is taking the longest time.  any ideas on this one?

This was not an easy one. I've to admit that the ideas that I can provide
might not even be applicable at all.

* In the subquery you compare sec_p.date to Historical_val.date, but in
  the FROM clause for the UPDATE statement, you a range of one day only.
  If you say "p.Date BETWEEN '20020114' AND 20020101'" this might gain
  you some speed.

* In the first CASE branch, you could replace the TOP subquery by saying
  SUM(p.SomeVal) - MAX(abs(p.SomeVal)). But this only works when you have
  TOP 1, so you can not use it for the second one.

I might have some more ideas, but they are fairly vague. And they might
rely on that you are actually only performing the UPDATE for one value
of Historical_val.date.

There is one thing in the query which is inconsistent:

Quote:
>      WHEN Historical_Val.[Days] = 14
>      THEN
>       (SELECT 100* SUM(p.[SomeVal])
>...
>      WHEN Historical_Val.[Days] = 30
>      THEN
>       (SELECT 100* SQRT(p.[SomeVal])

First you have SUM, then you have SQRT.

--

I support PASS - the definitive, global community for SQL Server
professionals - http://www.sqlpass.org
The PASS Community Summit is in Seattle Nov 19-22, I'll be there. And you?



Thu, 24 Mar 2005 05:01:37 GMT
 Tricky update query
Quote:
>> I am trying to update a table that has millions of rows with the

results of a calculated value from another table of slightly smaller
size ...  I think my major bottleneck is that I need to exclude the
top (n) observations out of my Sec_P table based on the number of days
in my historical_val table. <<

I did not see a reason for the proprietary UPDATE ..FROM syntax, other
than making the code hard to read and non-portable.  DATE is a
reserved word in Standard SQL (and most other products!) so don't use
it as a column name -- and it is too vague to be any good.  Likewise
"days" is a unit of mesurement, not a data element name -- maybe "age"
would be better?  Look up the ISO 11179 Metadata rules; it make your
code a LOT easier to read, understand and maintain.

It looks like the thing should be:

 UPDATE Historical_Values
    SET f_val
        = CASE THEN age = 14
               THEN ...
               WHEN age = 30
               THEN ...
          ELSE NULL END
  WHERE event_date = '2002-01-14';

What are the spec in English for the ...'s?



Fri, 25 Mar 2005 05:45:46 GMT
 Tricky update query
I have got to get back to packing or I will never get moved!  

Got an idea which I have notgot time to test.  The problem is to
exclude the most recent one (or two) values from the calculations.
That means we can adjust the date range in the BETWEEN predicate,
based on a subquery, instead of trying to remove the value itself.  It
would look something like this for the 14-day case:

(SELECT 100 * SUM (P1.someval)
   FROM Sec_P AS P1
  WHERE P1.id = Historical_Values.id
    AND P1.someval > -1.0
    AND P1.event_date
        BETWEEN Historical_Values.event_date - 13
            AND (SELECT MAX(event_date) - 1  -- exclude most recent
data
                   FROM Sec_P AS P2
                  WHERE P1.id = P2.id
                    AND P2.someval > -1.0)

Not sure how to handle a single data point, but perhaps a COALESCE()
would help.



Fri, 25 Mar 2005 06:37:29 GMT
 Tricky update query
Thanks very much for looking at this one.  I noticed a couple of you
pointed out that the underlying math is not consistent from one
subsection of the CASE to the other.  That was just an oversight, the
actual query i am trying to use has math that makes much more sense,
and is consistent but i stripped it out and just tried to use SUM or
SQRT kind of as a placeholder for my proprietary formulae.

The reason I have only used one day in my WHERE clause was just for
simplicity, in the actual query, i will be using much longer date
ranges

i realize that Date is not a good choice for column names, it bugs me
too but changing it at this point is gonna be a project of it's own
since this table is in production and there are many stored procs that
reference it.  i didn't choose the column name, and have been using
[Date] with a fair amt of success so changing this column name has
been low on my priority list.  As for "Days", it is a property of the
underlying data that conveys a reasonable meaning to us, but I will
def. refer to the document you mentioned, thxn.

I didn't realize my Update statement had proprietary syntax, but it is
possible...i almost always prefer to use Delete From followed by
Inserts instead of Updates, so my Update skills are underdeveloped.
But in this case, I don't think I have a choice but to perform an
Update on this table.

The main problem that I am having is that the SELECT TOP n ORDER BY
ABS(SomeVal) is very slow, so it would def be great to get rid of it
as a couple of you have suggested, but let me explain a bit more in
english what I'm trying to accomplish.

The data in Sec_P has values for SomeVal that range from -1.0 to +
infinity.  By definition, a value of SomeVal that is <= - 1.0 is
meaningless and probably bad data, so I exclude those rows from this
update, because passing these dubious values into my Mathematical
equation might cause domain errors and other bad things, not to
mention providing misleading results.

However, since the values of SomeVal can, and often are between -1.0
and 0, I have to take the absolute value.  The main point of this
query is to remove outliers from the data that is used to calculate my
F_Val, so F_Val is meant to operate on the data from Sec_P for a given
ID, [Date], and Days, with the largest observations of SomeVal (by
magnitude hence the ABS)removed to smooth the results.  Val uses the
same formula, but doesn't filter the results so it is much more
straightforward.

I also only included the Days = 14 and Days = 30 pieces of the CASE
statement to keep it shorter, but there are a few others and these
SELECT TOP 5 FROM Sec_P etc, so SELECT MAX instead of SELECT TOP n
def. wouldn't work

Please let me know if I can provide any further information, and I
really do appreciate you guys looking at this one.

Mike



Sat, 26 Mar 2005 00:04:11 GMT
 Tricky update query
Well, if anyone still cares about this one, i was able to speed this
query up quite a bit by filtering my subquery where i used to say
p.[Date] IN, now i am filtering the actual values.  the reason i think
this is ok is that i don't care what date the value corresponds to, i
just want to get that value out of there.  i think that this will not
be accurate if the n + 1 observation is = the n observation because
then i will be filtering more observations than i actual want, but
given the nature of the data in question (they are all floats that are
the result of calculations) i don't expect this to happen too often
and the slight inaccuracy is probably worth the performance
improvement.  is this just really bad form?

revised query:

use pubs
go

CREATE TABLE [Historical_Val] (
        [ID] [int] NOT NULL ,
        [Date] [smalldatetime] NOT NULL ,
        [Symbol] [varchar] (32) COLLATE SQL_Latin1_General_CP1_CI_AS NULL ,
        [Days] [int] NOT NULL ,
        [Val] [float] NULL ,
        [F_Val] [float] NULL ,
        CONSTRAINT [PK_Historical_Val] PRIMARY KEY  CLUSTERED
        (
                [ID],
                [Date],
                [Days]
        ) WITH  FILLFACTOR = 90  ON [PRIMARY]
) ON [PRIMARY]
GO

INSERT INTO Historical_Val
VALUES
(1, '1/14/2002', 'A', 14, 1.0, NULL)

CREATE TABLE [SEC_P] (
        [ID] [int] NOT NULL ,
        [Date] [smalldatetime] NOT NULL ,
        [SomeVal] [float] NOT NULL ,
        CONSTRAINT [PK_SEC_P] PRIMARY KEY  CLUSTERED
        (
                [ID],
                [Date]
        ) WITH  FILLFACTOR = 90  ON [PRIMARY]
) ON [PRIMARY]
GO

INSERT INTO Sec_P
VALUES
(1, '1/1/2002', 10.0)

INSERT INTO Sec_P
VALUES
(1, '1/2/2002', 9.0)

INSERT INTO Sec_P
VALUES
(1, '1/3/2002', 8.0)

INSERT INTO Sec_P
VALUES
(1, '1/4/2002', 7.0)

select * from sec_p
where date between '1/1/2002' and '1/14/2002'
GO

UPDATE Historical_Val
SET F_Val =
        CASE

        WHEN Historical_Val.[Days] = 14
        THEN
        (SELECT 100* SQRT(SUM(p.[SomeVal]))
        FROM Sec_P p
        WHERE p.[ID] = Historical_Val.[ID] AND   p.[Date] >
Historical_Val.[Date] - Historical_Val.Days
                AND p.[Date] < = Historical_Val.[Date] AND p.SomeVal > -1.0
        AND ABS(p.SomeVal) <
                (SELECT MIN(d.ABSSomeVal)
                FROM (SELECT TOP 1 ABS(SomeVal) AS ABSSomeVal FROM  Sec_P sp
                WHERE sp.[ID] = p.[ID] AND sp.[Date] > Historical_Val.[Date] -
Historical_Val.Days
                AND sp.[Date] < = Historical_Val.[Date] AND sp.SomeVal > -1.0
        ORDER BY ABS(SomeVal) DESC ) AS d))

        WHEN Historical_Val.[Days] = 30
        THEN
        (SELECT 100* SQRT(SUM(p.[SomeVal]))
        FROM Sec_P p
        WHERE p.[ID] = Historical_Val.[ID] AND p.[Date] >
Historical_Val.[Date] - Historical_Val.Days
                AND p.[Date] < = Historical_Val.[Date] AND p.SomeVal > -1.0
        AND ABS(p.SomeVal) <
                (SELECT MIN(d.ABSSomeVal)
                FROM (SELECT TOP 2 ABS(SomeVal) AS ABSSomeVal FROM  Sec_P sp
                WHERE sp.[ID] = p.[ID] AND sp.[Date] > Historical_Val.[Date] -
Historical_Val.Days
                AND sp.[Date] < = Historical_Val.[Date] AND sp.SomeVal > -1.0
        ORDER BY ABS(SomeVal) DESC ) AS d))

        END

WHERE
Historical_Val.[Date] BETWEEN '1/13/2002' AND '1/14/2002'

SELECT * FROM Historical_Val

DROP TABLE Historical_Val
DROP TABLE Sec_P



Sun, 27 Mar 2005 00:24:34 GMT
 Tricky update query
sorry, this was supposed to be part of the thread Tricky Update Query,
not a new one.  apologies for the mis-post


Sun, 27 Mar 2005 21:40:22 GMT
 
 [ 8 post ] 

 Relevant Pages 

1. tricky update query

2. Tricky UPDATE / Select SQL query problem.

3. Need help with a Tricky Update statement

4. Tricky Update

5. Tricky Update statement need help please...

6. Tricky update-puzzle

7. Tricky update statement

8. A tricky update close to be dangerous

9. Help with Tricky UPDATE sql statements.

10. Tricky Update SQL problem

11. Tricky Update Statement

12. A tricky update...


 
Powered by phpBB® Forum Software