How to translate a simple "semijoin" query from a SQL query into JavaScript?-CodePudding

According to this SO answer, we have:

SELECT DISTINCT s.id
FROM students s
LEFT JOIN grades g ON g.student_id = s.id
WHERE g.student_id IS NOT NULL

Which uses a LEFT JOIN. But it can be implemented with a "semijoin" like this:

SELECT s.id
FROM  students s
WHERE EXISTS (SELECT 1 FROM grades g
              WHERE g.student_id = s.id)

How do you write these two queries in JavaScript, given two collections students and grades? The first one I think is implemented with a "nested loop join" (or could be optimized with an index join data structure, but I don't know how to implement that just yet).

const query1Results = nestedJoin(students, grades, (s, g) => {
  return Boolean(g.student_id && g.student_id === s.id)
}).map(([s]) => s.id)

function nestedJoin(R, S, compare) {
  const out = []
  
  for (const r of R) {
    for (const s of S) {
      if (compare(r, s)) {
        out.push([ r, s ])
      }
    }
  }

  return out
}

How would you do the second one though?

const query2Results = semiJoin(students, grades, (s, g) => {
  return g.student_id === s.id
}).map(([s]) => s.id)

function semiJoin(R, S) {
  const out = []
  
  for (const r of R) {
    for (const s of S) {
      if (compare(r, s)) {
        out.push([ r, s ])
      }
    }
  }

  return out
}

Basically, I implemented it the same way.

const students = [
  { id: 1 },
  { id: 2 },
  { id: 3 },
  { id: 4 },
  { id: 5 },
  { id: 6 },
]
const grades = [
  { id: 1, student_id: 1 },
  { id: 2, student_id: 1 },
  { id: 3, student_id: 1 },
  { id: 4, student_id: 1 },
  { id: 5, student_id: 1 },
  { id: 6, student_id: 1 },
  { id: 10, student_id: 2 },
  { id: 20, student_id: 2 },
  { id: 30, student_id: 2 },
  { id: 40, student_id: 2 },
  { id: 50, student_id: 2 },
  { id: 60, student_id: 2 },
  { id: 100, student_id: 3 },
  { id: 200, student_id: 3 },
  { id: 300, student_id: 3 },
  { id: 400, student_id: 3 },
  { id: 500, student_id: 3 },
  { id: 600, student_id: 3 },
]

const expectedOutput = [
  1, 2, 3
]

CodePudding user response：

This really does depend on your table design and indexes but assuming you have one on grades.student_id, the JS equivalent might look like

const students = [{"id":1},{"id":2},{"id":3},{"id":4},{"id":5},{"id":6}]
const grades = [{"id":1,"student_id":1},{"id":2,"student_id":1},{"id":3,"student_id":1},{"id":4,"student_id":1},{"id":5,"student_id":1},{"id":6,"student_id":1},{"id":10,"student_id":2},{"id":20,"student_id":2},{"id":30,"student_id":2},{"id":40,"student_id":2},{"id":50,"student_id":2},{"id":60,"student_id":2},{"id":100,"student_id":3},{"id":200,"student_id":3},{"id":300,"student_id":3},{"id":400,"student_id":3},{"id":500,"student_id":3},{"id":600,"student_id":3}]

// index
const idxGradesStudentId = new Set(grades.map(({ student_id }) => student_id))

function semiJoin(R, S) {
  // WHERE EXISTS
  return R.filter(({ id }) => idxGradesStudentId.has(id))
    .map(({ id }) => id) // SELECT
}

console.log(semiJoin(students, grades))

So rather than a nested loop, this uses the index on grades.student_id (represented by a Set) to evaluate the EXISTS clause (Set.prototype.has()) which has the benefit of O(1) lookup time complexity.