Recently we faced a poor performance in one of our ETL routine and we found that it uses join on 2 tables and structure was very simple. Both table has pk index and sql is seeking right results. Only problem was with data size. Our table2 holds about 27 millions records and it joins to table1 that has only 10000 records. So there was cleanup missing that caused not to delete records from table2 :(.
So the task was to remove millions of unwanted rows. we can initiate delete but this huge amount can cause lot of redo and ultimately can lead to table lock by RDBMS. So the good way is to delete in chunks.
here is the test run
--======================
-- Creating First Table
--=======================
CREATE TABLE test1
(id INT NOT null);
ALTER TABLE test1 ADD CONSTRAINT pk_test1 PRIMARY KEY (id);
--Loading data to table 1
declare @i int
declare @rows_to_insert int
set @i = 0
set @rows_to_insert = 10000
while @i < @rows_to_insert
begin
INSERT INTO test1 VALUES (@i)
set @i = @i + 1
end
-- verifying load
select count(*) from test1;
--======================
-- Creating Second Table
--=======================
CREATE TABLE test2
(id INT NOT null);
ALTER TABLE test2 ADD CONSTRAINT pk_test2 PRIMARY KEY (id);
--Loading data, set @rows_to_insert for rows you want to load
declare @i int
declare @rows_to_insert int
set @i = 0
set @rows_to_insert = 200000
while @i < @rows_to_insert
begin
INSERT INTO test2 VALUES (@i)
set @i = @i + 1
END
-- verifying load
SELECT count(*) FROM test2;
-- Query to fetch records that exist in table2 but not in table1
SELECT test1.id AS id1,test2.id AS id2 FROM test2 t2
LEFT JOIN test1 t1 ON t1.id = t2.id
WHERE t1.id IS NULL;
-- Delete in chunk
WHILE @@rowcount > 0
BEGIN
DELETE TOP ( 5000 )
FROM TEST2
FROM test2 t2
LEFT JOIN test1 t1 ON t1.id = t2.id
WHERE t1.id IS NULL
END
So the task was to remove millions of unwanted rows. we can initiate delete but this huge amount can cause lot of redo and ultimately can lead to table lock by RDBMS. So the good way is to delete in chunks.
here is the test run
--======================
-- Creating First Table
--=======================
CREATE TABLE test1
(id INT NOT null);
ALTER TABLE test1 ADD CONSTRAINT pk_test1 PRIMARY KEY (id);
--Loading data to table 1
declare @i int
declare @rows_to_insert int
set @i = 0
set @rows_to_insert = 10000
while @i < @rows_to_insert
begin
INSERT INTO test1 VALUES (@i)
set @i = @i + 1
end
-- verifying load
select count(*) from test1;
--======================
-- Creating Second Table
--=======================
CREATE TABLE test2
(id INT NOT null);
ALTER TABLE test2 ADD CONSTRAINT pk_test2 PRIMARY KEY (id);
--Loading data, set @rows_to_insert for rows you want to load
declare @i int
declare @rows_to_insert int
set @i = 0
set @rows_to_insert = 200000
while @i < @rows_to_insert
begin
INSERT INTO test2 VALUES (@i)
set @i = @i + 1
END
-- verifying load
SELECT count(*) FROM test2;
-- Query to fetch records that exist in table2 but not in table1
SELECT test1.id AS id1,test2.id AS id2 FROM test2 t2
LEFT JOIN test1 t1 ON t1.id = t2.id
WHERE t1.id IS NULL;
-- Delete in chunk
WHILE @@rowcount > 0
BEGIN
DELETE TOP ( 5000 )
FROM TEST2
FROM test2 t2
LEFT JOIN test1 t1 ON t1.id = t2.id
WHERE t1.id IS NULL
END