Skip to content

SQLGraphExplorer‐7

Xin edited this page Dec 26, 2023 · 27 revisions

理论的完善

论文中给出的公式包括如下关系代数运算符:

$$\sigma , \Pi , \bowtie , \cup , \cap$$

还缺少:

$$\rho(rename), \\\ -(difference),但论文中明确说了该运算符无法逆运算$$

ρ : rename

$$\widetilde{E}(\mathcal{P})=\widetilde{E'}(Remap(\mathcal{P},X,sch(E')))\\\ E=\rho_X(E'),\\\ \mathcal{P}=\{p_1,p_2,...\}, \\\ p_i=[N,C]=[(t),C]=[(t_1,t_2,...,t_m),C]$$ $$X=\{dst_1,dst_2,...,dst_n\}\\\ sch(E')=\{src_1,src_2,...,src_n\}\\\ Remap(\mathcal{P},X,sch(E'))=\{remap(p,X,sch(E'))|p\in\mathcal{P}\}\\\ remap(p,X,sch(E'))=[(t_i.src_i),C]$$

example:

-- sch(A)=A1,A2,A3, sch(C)=C1,C2
SELECT
A.A1 AS C1,
A.A2 AS C2
FROM A
$$E=\rho_{(C1,C2)}(\Pi_{A1,A2}(A))$$

假设结果为

$$\mathcal{P}=\{[(c1,c2),\top]\}$$

推导过程:

$$\begin{align} & \widetilde{E}(\mathcal{P}) \\\ & =\widetilde{E_1}(Remap(\mathcal{P},X,sch(E_1))) & E_1=\Pi_{A1,A2}(A) \\\ & =\widetilde{E_1}([c1,c2],\top) & \\\ & =\widetilde{A}(Ext(\mathcal{P},sch(A))) & \\\ & =[(c1,c2,A3),\top] \end{align}$$
-- 结果对应语句
SELECT A1,A2,A3 FROM A WHERE A1=c1, A2=c2

sum of product(sop)

有三个目的:

  1. 简化布尔代数:(B=b AND B=b) -> (B=b)
  2. 理论需要:上次说的模糊之处,selection 的运算需要将过滤条件标准化成 sop
  3. 冗余过滤:详见下一小节

冗余问题

由于多表选择的原因,反向选择的结果存在冗余的 WHERE 过滤条件,详见test-case-1的结果:

-- A1
SELECT
  A11,
  A12,
  A13
FROM
  A1
WHERE
  A1.A11 = gh
  AND A1.A12 = xm
  AND A1.A13 = gj

  AND LENGTH(NVL(A1.A11,'')) > 0
  AND A1.A14 <= '#{ETL_DT}'
  AND A1.A15 > '#{ETL_DT}'

  AND A1.B11 = B.B01
  AND B.B02 <= '#{ETL_DT}'
  AND B.B03 > '#{ETL_DT}'

上面可见:明明是针对 A1 表的选择,在 WHERE 过滤条件中却出现了不相干的表 B,不妨称其为冗余问题

没有找到解决此问题的相关论文,但是有个“拍脑门”的明显的解决方法:直接将所有“存在无关表”的最小布尔表达式设置为 TRUE:

-- A1
SELECT
  A11,
  A12,
  A13
FROM
  A1
WHERE
  A1.A11 = gh
  AND A1.A12 = xm
  AND A1.A13 = gj

  AND LENGTH(NVL(A1.A11,'')) > 0
  AND A1.A14 <= '#{ETL_DT}'
  AND A1.A15 > '#{ETL_DT}'

  AND TRUE
  AND TRUE
  AND TRUE

然后使用上一小节的简化公式进行消除即可:

-- A1
SELECT
  A11,
  A12,
  A13
FROM
  A1
WHERE
  A1.A11 = gh
  AND A1.A12 = xm
  AND A1.A13 = gj

  AND LENGTH(NVL(A1.A11,'')) > 0
  AND A1.A14 <= '#{ETL_DT}'
  AND A1.A15 > '#{ETL_DT}'

Test-Case

test-case-1

-- sch(A1)=A11,A12,A13,A14,A15,A16,B11
-- sch(A2)=A21,A22,A23,A24,A25,A26,B21
-- sch(B)= B01,B02,B03,B04
-- sch(C)=GH,XM,GJ
SELECT
  A1.A11 AS GH, /*工号*/
  A1.A12 AS XM, /*姓名*/
  A1.A13 AS GJ  /*国籍*/
FROM
  A1 /*在职人员基本信息*/
  JOIN B /*机构基本情况*/
  ON A1.B11 = B.B01
  AND B.B02 <= '#{ETL_DT}'
  AND B.B03 > '#{ETL_DT}'
WHERE
  LENGTH (NVL (A1.A11, '')) > 0 /*员工编号有值的数据*/
  AND A1.A14 <= '#{ETL_DT}'
  AND A1.A15 > '#{ETL_DT}'

UNION ALL --从减员人员基本信息表中获取除了状态为'非正常在岗'的人员信息

SELECT
  A2.A21 AS GH, /*工号*/
  A2.A22 AS XM, /*姓名*/
  A2.A23 AS GJ  /*国籍*/
FROM
  A2
  JOIN B /*机构基本情况*/
  ON A2.B21 = B.B01
  AND B.B02 <= '#{ETL_DT}'
  AND B.B03 > '#{ETL_DT}'
WHERE
  LENGTH (NVL (A2.A21, '')) > 0 /*员工编号有值的数据*/
  AND A2.A24 <= '#{ETL_DT}'
  AND A2.A25 > '#{ETL_DT}'
  AND A2.A26 <> '0101';
$$E=\Pi_{A11,A12,A13}(\sigma_{selection1}(\sigma_{selection2}(A1\bowtie B)))\cup\Pi_{A21,A22,A23}(\sigma_{selection3}(\sigma_{selection4}(A2\bowtie B)))$$
-- selection1=
LENGTH(NVL(A1.A11,'')) > 0
AND A1.A14 <= '#{ETL_DT}'
AND A1.A15 > '#{ETL_DT}'
-- selection2=
A1.B11 = B.B01
AND B.B02 <= '#{ETL_DT}'
AND B.B03 > '#{ETL_DT}'
-- selection3=
LENGTH(NVL(A2.A21,'')) > 0
AND A2.A24 <= '#{ETL_DT}'
AND A2.A25 > '#{ETL_DT}'
AND A2.A26 <> '0101';
-- selection4=
A2.B21 = B.B01
AND B.B02 <= '#{ETL_DT}'
AND B.B03 > '#{ETL_DT}'

假设执行结果是

image-20231208134310383

推导过程如下:

$$ \begin{align} & \widetilde{E}(\mathcal{P}) \ & =\widetilde{E_1}(\mathcal{P})\diamond \widetilde{E_2}(\mathcal{P}) \ & E_1=\Pi_{A11,A12,A13}(\sigma_{selection1}(\sigma_{selection2}(A1\bowtie B))) \ & E_2=\Pi_{A21,A22,A23}(\sigma_{selection3}(\sigma_{selection4}(A2\bowtie B))) \ \

& =\widetilde{E_5}([tuple_3, \top \wedge selection1])\diamond \widetilde{E_6}([tuple_4,\top \wedge selection3]) \ & E_5=\sigma_{selection2}(A1\bowtie B) \ & E_6=\sigma_{selection4}(A2\bowtie B) \ \

& =\widetilde{E_7}([tuple_3,\top \wedge selection1 \wedge selection2]) \diamond \widetilde{E_8}([tuple_4,\top \wedge selection3 \wedge condtion4]) \ & E_7=A1\bowtie B \ & E_8=A2\bowtie B \ \

& ={\widetilde{A1}(Proj(\mathcal{P}_1, sch(A1))) \odot \widetilde{B}(Proj(\mathcal{P}_1, sch(B)))} \ & \diamond {\widetilde{A2}(Proj(\mathcal{P}_2, sch(A2))) \odot \widetilde{B}(Proj(\mathcal{P}_2, sch(B)))} \ & \mathcal{P}_1=[tuple_3,selection1 \wedge selection2] \ & \mathcal{P}_2=[tuple_4,selection3 \wedge condtion4] \ \

& ={\widetilde{A1}([(gh,xm,gj,A14,A15,A16,B11),selection1 \wedge selection2]) \odot \widetilde{B}([(B01,B02,B03,B04),selection1 \wedge selection2])} \ & \diamond {\widetilde{A2}([(gh,xm,gj,A24,A25,A26,B21),selection3 \wedge selection4]) \odot \widetilde{B}([(B01,B02,B03,B04),selection3 \wedge selection4])} \ \

& ={[(gh,xm,gj,A14,A15,A16,B11),selection1 \wedge selection2], \ & [(B01,B02,B03,B04),selection1 \wedge selection2], \ & [(gh,xm,gj,A24,A25,A26,B21),selection3 \wedge selection4], \ & [(B01,B02,B03,B04),selection3 \wedge selection4]} \ \end{align} $$

-- A1
SELECT
  A11,
  A12,
  A13
FROM
  A1
WHERE
  A1.A11 = gh
  AND A1.A12 = xm
  AND A1.A13 = gj

  AND LENGTH(NVL(A1.A11,'')) > 0
  AND A1.A14 <= '#{ETL_DT}'
  AND A1.A15 > '#{ETL_DT}'

  AND A1.B11 = B.B01
  AND B.B02 <= '#{ETL_DT}'
  AND B.B03 > '#{ETL_DT}'

-- A2
SELECT
  A21,
  A22,
  A23
FROM
  A2
WHERE
  A2.A21 = gh
  AND A2.A22 = xm
  AND A2.A23 = gj

  AND LENGTH(NVL(A2.A21,'')) > 0
  AND A2.A24 <= '#{ETL_DT}'
  AND A2.A25 > '#{ETL_DT}'
  AND A2.A26 <> '0101';

  AND A2.B21 = B.B01
  AND B.B02 <= '#{ETL_DT}'
  AND B.B03 > '#{ETL_DT}'

Clone this wiki locally