郵便番号データベースの分割レコードのマージ方法を比較する

2010年10月11日と2010年10月31日との記事で郵便番号データベースの分割レコードをマージする方法を二つ紹介しました.今回はその方法を比較検証します.

  1. Flag13が0の複数レコードを抽出してマージ
  2. 開き括弧と閉じ括弧の対応で抽出してマージ

結論から申し上げると,後者の方がより正確にマージできます.


USE ZIPDB;
GO
SELECT * INTO #K1 FROM dbo.KEN_ALL;
SELECT * INTO #K2 FROM dbo.KEN_ALL;
GO
WITH CTE (MINID, ZIPCODE) AS
(
SELECT MIN(ID), ZIPCode
FROM #K1
WHERE Flag13 = '0'
GROUP BY ZIPCode
HAVING COUNT(ID) > 1
),
CTE1 (ID, ZIPCODE, AREA, AREA_kana, DATALEVEL) AS
(
SELECT T1.ID, T1.ZIPCode, T1.AREA, T1.AREA_kana, 1
FROM CTE INNER JOIN #K1 AS T1
ON CTE.ZIPCODE = T1.ZIPCode
WHERE T1.Flag13 = '0'
UNION ALL
SELECT T2.ID, T2.ZIPCode, CTE1.AREA + T2.AREA, CTE1.AREA_kana + T2.AREA_kana, CTE1.DATALEVEL + 1
FROM CTE1 INNER JOIN #K1 AS T2
ON CTE1.ZIPCODE = T2.ZIPCode
AND CTE1.ID < T2.ID
WHERE T2.Flag13 = '0'
),
CTE2 (ZIPCODE, MAXLEVEL) AS
(
SELECT ZIPCODE, MAX(DATALEVEL)
FROM CTE1
GROUP BY ZIPCODE
),
CTE3 (ID, ZIPCODE, AREA, AREA_kana, DATALEVEL) AS
(
SELECT CTE1.ID, CTE1.ZIPCODE, CTE1.AREA, CTE1.AREA_kana, CTE1.DATALEVEL
FROM CTE2 INNER JOIN CTE1
ON CTE2.ZIPCODE = CTE1.ZIPCODE
AND CTE2.MAXLEVEL = CTE1.DATALEVEL
)
UPDATE #K1
SET AREA = CTE3.AREA, AREA_kana = CTE3.AREA_kana
FROM #K1 INNER JOIN CTE3
ON #K1.ZIPCode = CTE3.ZIPCODE
WHERE #K1.Flag13 = '0';
GO
WITH START_ID (startid) AS
(
SELECT ID
FROM #K2
WHERE AREA LIKE '%(%'
AND AREA NOT LIKE '%)%'
),
END_ID (endid) AS
(
SELECT ID
FROM #K2
WHERE AREA NOT LIKE '%(%'
AND AREA LIKE '%)%'
),
CTE (startid, endid, diff) AS
(
SELECT START_ID.startid, END_ID.endid, END_ID.endid - START_ID.startid
FROM START_ID INNER JOIN END_ID
ON START_ID.startid < END_ID.endid
),
CTE1 (startid, mindiff) AS
(
SELECT startid, MIN(diff)
FROM CTE
GROUP BY startid
),
CTE2 (startid, endid, diff) AS
(
SELECT CTE.startid, CTE.endid, CTE.diff
FROM CTE1 INNER JOIN CTE
ON CTE1.startid = CTE.startid
AND CTE1.mindiff = CTE.diff
)
SELECT CTE2.startid, CTE2.endid, #K2.*
INTO #CTE3
FROM CTE2 INNER JOIN #K2
ON #K2.ID BETWEEN CTE2.startid AND CTE2.endid;

WITH CTE4 (ID, startid, endid, DATALEVEL, AREA, AREA_kana) AS
(
SELECT ID, startid, endid, 0, AREA, AREA_kana
FROM #CTE3
WHERE ID = startid
UNION ALL
SELECT #CTE3.ID, #CTE3.startid, #CTE3.endid, DATALEVEL + 1, CTE4.AREA + #CTE3.AREA, CTE4.AREA_kana + #CTE3.AREA_kana
FROM #CTE3 INNER JOIN CTE4
ON CTE4.startid = #CTE3.startid
AND CTE4.ID < #CTE3.ID
),
CTE5 (startid, MAXLEVEL) AS
(
SELECT startid, MAX(DATALEVEL)
FROM CTE4
GROUP BY startid
),
CTE6 (ID, startid, endid, DATALEVEL, AREA, AREA_kana) AS
(
SELECT CTE4.ID, CTE4.startid, CTE4.endid, CTE4.DATALEVEL, CTE4.AREA, CTE4.AREA_kana
FROM CTE4 INNER JOIN CTE5
ON CTE4.startid = CTE5.startid
AND CTE4.DATALEVEL = CTE5.MAXLEVEL
)
UPDATE #K2
SET AREA = CTE6.AREA, AREA_kana = CTE6.AREA_kana
FROM #K2 INNER JOIN CTE6
ON #K2.ID BETWEEN CTE6.startid AND CTE6.endid;
GO
DELETE FROM #K1
WHERE #K1.ID > (SELECT MIN(ID)
FROM #K1 AS K1
WHERE #K1.ZIPCODE = K1.ZIPCODE
AND #K1.AREA = K1.AREA);

DELETE FROM #K2
WHERE #K2.ID > (SELECT MIN(ID)
FROM #K2 AS K2
WHERE #K2.ZIPCODE = K2.ZIPCODE
AND #K2.AREA = K2.AREA);
GO
SELECT ID, ZIPcode, LGcode, Flag13, AREA FROM #K1
EXCEPT
SELECT ID, ZIPcode, LGcode, Flag13, AREA FROM #K2;

SELECT ID, ZIPcode, LGcode, Flag13, AREA FROM #K2
EXCEPT
SELECT ID, ZIPcode, LGcode, Flag13, AREA FROM #K1;

(122943 行処理されました)

(122943 行処理されました)

(550 行処理されました)

(554 行処理されました)

(554 行処理されました)

(466 行処理されました)

(468 行処理されました)

(4 行処理されました)

(2 行処理されました)

ID ZIPcode LGcode Flag13 AREA
84435 5900111 27142 1 三原台(2丁11−7・9〜17・21、3丁59−2〜18・21〜25、
84436 5900111 27142 1 4丁60−1・107−3〜12・16〜21)
118567 8710046 44203 1 金谷(上ノ丁、中ノ丁、西ノ丁、本町、南ノ丁、
118568 8710046 44203 1 森ノ丁、山ノ神)

ID ZIPcode LGcode Flag13 AREA
84435 5900111 27142 1 三原台(2丁11−7・9〜17・21、3丁59−2〜18・21〜25、4丁60−1・107−3〜12・16〜21)
118567 8710046 44203 1 金谷(上ノ丁、中ノ丁、西ノ丁、本町、南ノ丁、森ノ丁、山ノ神)