Bladeren bron

:recycle: complete refactor of data generator | generate data in batches, and insert one file at a time

tags/0.0.1
j 4 jaren geleden
bovenliggende
commit
e8f5684cbf

+ 1
- 1
.gitignore Bestand weergeven

5
 *.local
5
 *.local
6
 **/.env
6
 **/.env
7
 .nyc_output
7
 .nyc_output
8
-**/_generated.js
8
+**/generated/*

+ 2
- 1
backend/README.md Bestand weergeven

29
 * Run `npm run unmigrate` to roll back one migration
29
 * Run `npm run unmigrate` to roll back one migration
30
 
30
 
31
 ### Seeding
31
 ### Seeding
32
-* Run `npm run seed` to seed the database with dummy data
32
+* Run `npm run generate` to generate dummy data
33
+* Run `npm run seed` to seed the database with generated dummy data
33
 
34
 
34
 ### Restarting
35
 ### Restarting
35
 Since we can't unseed the database, it's best to destroy the `dev` database and rebuild it.
36
 Since we can't unseed the database, it's best to destroy the `dev` database and rebuild it.

+ 270
- 154
backend/db/generator.js Bestand weergeven

1
 const fs = require('fs')
1
 const fs = require('fs')
2
 const cosineSimilarity = require('compute-cosine-similarity')
2
 const cosineSimilarity = require('compute-cosine-similarity')
3
-const mockOutputPath = './db/_generated.js'
3
+const mockOutputPath = './db/generated/_batch'
4
+const magic = 1000
4
 
5
 
5
 // Insert here how many users you would like to generate:
6
 // Insert here how many users you would like to generate:
6
-const total = 10
7
+const total = 1000
8
+const batchSize = 100
9
+let batchCount = 1 // Counter to track how many things we've generated
10
+
7
 let extraProfilesToGenerate = 0
11
 let extraProfilesToGenerate = 0
12
+let extraProfileCount = 0 // Counter to track how many EXTRA profiles we've generated
13
+
14
+let generatedResponseCount = 0
8
 
15
 
9
 // Amount of responses for a complete survey
16
 // Amount of responses for a complete survey
10
 const questions = 13
17
 const questions = 13
12
 // Seekers per 100 profiles
19
 // Seekers per 100 profiles
13
 const percentageOfSeekers = 90
20
 const percentageOfSeekers = 90
14
 
21
 
15
-// Values for responses
22
+const scoreVals = [100, 140, 180, 220, 260, 400]
23
+// Values for responses as strings
16
 const possibleResponses = {
24
 const possibleResponses = {
17
-    not_important: '120',
18
-    some_what_important: '140',
19
-    important: '160',
20
-    very_important: '180',
21
-    extremely_important: '200',
22
-    mandatory: '400',
25
+    not_important: null,
26
+    some_what_important: null,
27
+    important: null,
28
+    very_important: null,
29
+    extremely_important: null,
30
+    mandatory: null,
31
+}
32
+for (let i = 0; i < Object.keys(possibleResponses).length; i++) {
33
+    const key = Object.keys(possibleResponses)[i]
34
+    possibleResponses[key] = scoreVals[i].toString()
35
+}
36
+
37
+/**
38
+ * Our initial file setup
39
+ */
40
+const header = `/**
41
+* GENERATED MOCK SIIMEE DATA
42
+* Generated at: ${Date.now()}
43
+*/
44
+`
45
+const write = async (batchNum, outputDataObject) => {
46
+    await fs.writeFile(`${mockOutputPath}_${batchNum}.js`, '', () => {})
47
+    fs.appendFile(
48
+        `${mockOutputPath}_${batchNum}.js`,
49
+        header + 'module.exports = ' + JSON.stringify(outputDataObject),
50
+        err => {
51
+            if (err) {
52
+                console.error(err)
53
+                return
54
+            }
55
+        },
56
+    )
23
 }
57
 }
24
 
58
 
59
+/**
60
+ * [100, 140, 180, 220, 260, 400]
61
+ */
62
+const preComputedScores = {
63
+    100: {
64
+        100: 0,
65
+        140: 0,
66
+        180: 0,
67
+        220: 0,
68
+        260: 0,
69
+        400: 0,
70
+    },
71
+    140: {
72
+        100: 0,
73
+        140: 0,
74
+        180: 0,
75
+        220: 0,
76
+        260: 0,
77
+        400: 0,
78
+    },
79
+    180: {
80
+        100: 0,
81
+        140: 0,
82
+        180: 0,
83
+        220: 0,
84
+        260: 0,
85
+        400: 0,
86
+    },
87
+    220: {
88
+        100: 0,
89
+        140: 0,
90
+        180: 0,
91
+        220: 0,
92
+        260: 0,
93
+        400: 0,
94
+    },
95
+    260: {
96
+        100: 0,
97
+        140: 0,
98
+        180: 0,
99
+        220: 0,
100
+        260: 0,
101
+        400: 0,
102
+    },
103
+    400: {
104
+        100: 0,
105
+        140: 0,
106
+        180: 0,
107
+        220: 0,
108
+        260: 0,
109
+        400: 0,
110
+    },
111
+}
112
+const score2d = (a, b) => {
113
+    const aScorePlusBase = [100]
114
+    const bScorePlusBase = [100]
115
+    aScorePlusBase.push(a)
116
+    bScorePlusBase.push(b)
117
+    return Math.round(
118
+        Math.pow(cosineSimilarity(aScorePlusBase, bScorePlusBase), 10) * magic,
119
+    )
120
+}
121
+scoreVals.forEach(val => {
122
+    scoreVals.forEach(v => {
123
+        preComputedScores[val][v] = score2d(val, v)
124
+    })
125
+})
126
+
25
 const possibleZipcodes = [
127
 const possibleZipcodes = [
26
     '90065', // Glassel
128
     '90065', // Glassel
27
     '90012', // Chinatown
129
     '90012', // Chinatown
42
         ? 1
144
         ? 1
43
         : Math.floor(Math.random() * max)
145
         : Math.floor(Math.random() * max)
44
 }
146
 }
45
-
46
 const randomValFrom = arr => arr[randomNumber(arr.length)]
147
 const randomValFrom = arr => arr[randomNumber(arr.length)]
47
 const randomEmail = (length = 5) => {
148
 const randomEmail = (length = 5) => {
48
     let chars =
149
     let chars =
74
 const generate = (classObj, amount, meta) => {
175
 const generate = (classObj, amount, meta) => {
75
     const instances = []
176
     const instances = []
76
     for (let i = 0; i < amount; i++) {
177
     for (let i = 0; i < amount; i++) {
77
-        instances.push(new classObj(i + 1, meta))
178
+        let startFrom = meta?.starting ? meta.starting - batchSize : 0
179
+        instances.push(new classObj(i + startFrom + 1, meta))
78
     }
180
     }
79
     return instances
181
     return instances
80
 }
182
 }
90
 }
192
 }
91
 class Profile {
193
 class Profile {
92
     constructor(id, override) {
194
     constructor(id, override) {
93
-        this.user_id = override ? override.user_id : id
94
-        this.profile_id = override ? override.profile_id + id : id
195
+        this.user_id = override?.user_id ? override.user_id : id
196
+        this.profile_id = override?.profile_id ? override.profile_id + id : id
95
     }
197
     }
96
 }
198
 }
97
 class Response {
199
 class Response {
103
     }
205
     }
104
 }
206
 }
105
 
207
 
106
-const users = generate(User, total)
107
-users.forEach(user => {
108
-    user.is_poster = randomNumber(100) > percentageOfSeekers ? 1 : 0
109
-    if (user.is_poster) {
110
-        extraProfilesToGenerate = extraProfilesToGenerate + randomNumber(2)
111
-    }
112
-    user.user_name = randomName() + ' ' + randomName()
113
-    user.user_email = randomEmail()
114
-})
115
-let jobPosterIds = users
116
-    .filter(user => user.is_poster > 0)
117
-    .map(user => user.user_id)
208
+console.log('\nStarting...\n---')
118
 
209
 
119
-// Guarentee ONE job poster
120
-if (!jobPosterIds.length) {
121
-    randomValFrom(users).is_poster = 1
122
-    jobPosterIds = users
210
+for (let batch = batchSize; batch <= total; batch += batchSize) {
211
+    /**
212
+     * Generate Users
213
+     */
214
+    let users = generate(User, batchSize, {
215
+        starting: batchSize * batchCount,
216
+    })
217
+    users.forEach(user => {
218
+        user.is_poster = randomNumber(100) > percentageOfSeekers ? 1 : 0
219
+        if (user.is_poster) {
220
+            extraProfilesToGenerate = extraProfilesToGenerate + randomNumber(2)
221
+        }
222
+        user.user_name = randomName() + ' ' + randomName()
223
+        user.user_email = randomEmail()
224
+    })
225
+    let jobPosterIds = users
123
         .filter(user => user.is_poster > 0)
226
         .filter(user => user.is_poster > 0)
124
         .map(user => user.user_id)
227
         .map(user => user.user_id)
125
-}
228
+    console.log('COMPLETED: Generated Users...')
126
 
229
 
127
-const profiles = generate(Profile, total)
128
-
129
-// Generate extra job posting profiles
130
-// attributed to random user.is_poster === true
131
-// TODO: Clean this up. Hard to read...
132
-if (extraProfilesToGenerate > 0) {
133
-    let extras = []
134
-    for (let l = 0; l < extraProfilesToGenerate; l++) {
135
-        const generatedExtraProfiles = generate(Profile, 1, {
136
-            user_id:
137
-                jobPosterIds.length > 1
138
-                    ? randomValFrom(jobPosterIds)
139
-                    : jobPosterIds[0],
140
-            profile_id: profiles.length + l,
141
-        })
142
-        extras = [...extras, ...generatedExtraProfiles]
230
+    // Guarantee ONE job poster
231
+    if (!jobPosterIds.length) {
232
+        randomValFrom(users).is_poster = 1
233
+        jobPosterIds = users
234
+            .filter(user => user.is_poster > 0)
235
+            .map(user => user.user_id)
143
     }
236
     }
144
-    extras.forEach(profile => profiles.push(profile))
145
-}
146
 
237
 
147
-// Generate responses, then fill in details
148
-const responses = generate(
149
-    Response,
150
-    (total + extraProfilesToGenerate) * questions,
151
-)
152
-profiles.forEach((profile, i) => {
153
-    const startingIndex = i * questions
154
-    for (let k = 0; k < questions; k++) {
155
-        const resToEdit = responses[startingIndex + k]
156
-        resToEdit.response_key_id = k + 1
157
-        resToEdit.profile_id = profile.profile_id
158
-        resToEdit.val =
159
-            k + 1 == questions
160
-                ? randomValFrom(possibleZipcodes)
161
-                : randomValFrom(Object.values(possibleResponses))
238
+    /**
239
+     * Generate Profiles
240
+     */
241
+    let profiles = generate(Profile, batchSize, {
242
+        starting: batchSize * batchCount,
243
+        profile_id: extraProfileCount,
244
+    })
245
+    // Generate extra job posting profiles
246
+    // attributed to random user.is_poster === true
247
+    // TODO: Clean this up. Hard to read...
248
+    if (extraProfilesToGenerate > 0) {
249
+        let extras = []
250
+        for (let l = 0; l < extraProfilesToGenerate; l++) {
251
+            const generatedExtraProfiles = generate(Profile, 1, {
252
+                user_id:
253
+                    jobPosterIds.length > 1
254
+                        ? randomValFrom(jobPosterIds)
255
+                        : jobPosterIds[0],
256
+                profile_id: batchSize * batchCount + extraProfileCount + l,
257
+            })
258
+            extras = [...extras, ...generatedExtraProfiles]
259
+        }
260
+        extras.forEach(profile => {
261
+            profiles.push(profile)
262
+            extraProfileCount++
263
+        })
162
     }
264
     }
163
-})
164
-
165
-/**
166
- * Score all the profiles!
167
- */
168
-const scoreResponses = (seeker, potentialMatch) => {
169
-    const seekerResponses = responses
170
-        .filter(response => response.profile_id == seeker.profile_id)
171
-        .filter(response => response.val.length < 4)
172
-
173
-    const potentialMatchResponses = responses
174
-        .filter(response => response.profile_id == potentialMatch.profile_id)
175
-        .filter(response => response.val.length < 4)
265
+    console.log('COMPLETED: Generated Profiles...')
176
 
266
 
177
-    const checkValCb = res => {
178
-        const val = parseInt(res.val)
179
-        return isNaN(val) ? 0 : val
180
-    }
181
-    return Math.floor(
182
-        cosineSimilarity(
183
-            seekerResponses.map(checkValCb),
184
-            potentialMatchResponses.map(checkValCb),
185
-        ) * 1000,
267
+    /**
268
+     * Generate Responses
269
+     */
270
+    // Generate responses first, before filling in details
271
+    let responses = generate(
272
+        Response,
273
+        (batchSize + extraProfilesToGenerate) * questions,
274
+        { starting: generatedResponseCount + batchSize },
186
     )
275
     )
187
-}
188
-const scoreProfile = (profile, potentialMatchList) => {
189
-    const scored = potentialMatchList.map(profileToCompare => {
190
-        return {
191
-            match_queue_id: null,
192
-            profile_id: profile.profile_id,
193
-            target_id: profileToCompare.profile_id,
194
-            is_deleted: false,
195
-            score: scoreResponses(profile, profileToCompare),
276
+    profiles.forEach((profile, i) => {
277
+        const startingIndex = i * questions
278
+        for (let k = 0; k < questions; k++) {
279
+            const resToEdit = responses[startingIndex + k]
280
+            resToEdit.response_key_id = k + 1
281
+            resToEdit.profile_id = profile.profile_id
282
+            resToEdit.val =
283
+                k + 1 == questions
284
+                    ? randomValFrom(possibleZipcodes)
285
+                    : randomValFrom(Object.values(possibleResponses))
196
         }
286
         }
197
     })
287
     })
198
-    return scored.sort((a, b) => a.score - b.score)
199
-}
200
-const scoreAll = () => {
201
-    let scores = []
202
-    const posterProfiles = profiles.filter(profile =>
203
-        jobPosterIds.includes(profile.user_id),
204
-    )
205
-    const seekerProfiles = profiles.filter(
206
-        profile => !jobPosterIds.includes(profile.user_id),
207
-    )
208
-    seekerProfiles.forEach(seeker => {
209
-        const scored = scoreProfile(seeker, posterProfiles)
210
-        scores = [...scored, ...scores]
211
-    })
212
-    posterProfiles.forEach(poster => {
213
-        const scored = scoreProfile(poster, seekerProfiles)
214
-        scores = [...scored, ...scores]
215
-    })
216
-    return scores.reverse()
288
+    generatedResponseCount = generatedResponseCount + responses.length
289
+    console.log('COMPLETED: Generated Responses...')
290
+
291
+    /**
292
+     * Our output format
293
+     */
294
+    const outputDataObject = { users, profiles, responses }
295
+    // const outputDataObject = { users, profiles, responses, match_queues }
296
+    write(batchSize * batchCount, outputDataObject)
297
+    batchCount++
217
 }
298
 }
218
-const match_queues = scoreAll().map((score, i) => {
219
-    score.match_queue_id = i + 1
220
-    // Comment out to see the score
221
-    delete score.score
222
-    return score
223
-})
224
 
299
 
225
-/**
226
- * Our output format
227
- */
228
-const outputDataObject = { users, profiles, responses, match_queues }
300
+// /**
301
+//  * Score all the profiles!
302
+//  */
303
+// const compareProfileResponses = (seeker, potentialMatch) => {
304
+//     const checkValCb = res => {
305
+//         const val = parseInt(res.val)
306
+//         return isNaN(val) ? 0 : val
307
+//     }
308
+//     const filterBy = idToCheckFor => {
309
+//         return responses
310
+//             .filter(
311
+//                 response =>
312
+//                     response.profile_id == idToCheckFor &&
313
+//                     response.val.length < 4,
314
+//             )
315
+//             .map(checkValCb)
316
+//     }
317
+//     const seekerResponses = filterBy(seeker.profile_id)
318
+//     const potentialMatchResponses = filterBy(potentialMatch.profile_id)
319
+//     const cachedScores = []
320
+//     seekerResponses.forEach(seekerResponse => {
321
+//         potentialMatchResponses.forEach(potentialResponse => {
322
+//             cachedScores.push(
323
+//                 preComputedScores[seekerResponse][potentialResponse],
324
+//             )
325
+//         })
326
+//     })
327
+//     return Math.round(
328
+//         cachedScores.reduce((a, b) => a + b) / cachedScores.length,
329
+//     )
330
+// }
331
+// const scoreProfile = (profile, potentialMatchList) => {
332
+//     return potentialMatchList
333
+//         .map(profileToCompare => {
334
+//             return {
335
+//                 match_queue_id: null,
336
+//                 profile_id: profile.profile_id,
337
+//                 target_id: profileToCompare.profile_id,
338
+//                 is_deleted: false,
339
+//                 score: compareProfileResponses(profile, profileToCompare),
340
+//             }
341
+//         })
342
+//         .sort((a, b) => a.score - b.score)
343
+// }
229
 
344
 
230
-const jobPostings = profiles.filter(profile =>
231
-    jobPosterIds.includes(profile.user_id),
232
-).length
233
-const jobPosters = users.filter(user => user.is_poster > 0).length
234
-const header = `/**
235
- * GENERATED MOCK SIIMEE DATA
236
- * Generated at: ${Date.now()}
237
- * ---
238
- * ${jobPostings} positions listed by ${jobPosters} job posters
239
- * ${total + extraProfilesToGenerate - jobPostings} candidate profiles by ${
240
-    total + extraProfilesToGenerate - jobPostings
241
-} job seekers
242
- * ---
243
- * ${total + extraProfilesToGenerate} Profiles
244
- * ${total} Users
245
- */
246
-`
247
-const write = async () => {
248
-    await fs.writeFile(mockOutputPath, '', () => {})
249
-    fs.appendFile(
250
-        mockOutputPath,
251
-        header + 'module.exports = ' + JSON.stringify(outputDataObject),
252
-        err => {
253
-            if (err) {
254
-                console.error(err)
255
-                return
256
-            }
257
-        },
258
-    )
259
-}
260
-write()
345
+// const scoreAll = () => {
346
+//     process.stdout.write('\nScoring Profiles')
347
+//     let scores = []
348
+//     const posterProfiles = profiles.filter(profile =>
349
+//         jobPosterIds.includes(profile.user_id),
350
+//     )
351
+//     const seekerProfiles = profiles.filter(
352
+//         profile => !jobPosterIds.includes(profile.user_id),
353
+//     )
354
+//     process.stdout.write('.')
355
+//     for (let i = 0; i < seekerProfiles.length; i++) {
356
+//         const scored = scoreProfile(seekerProfiles[i], posterProfiles)
357
+//         scores.push(...scored)
358
+//     }
359
+//     process.stdout.write('.')
360
+//     for (let j = 0; j < posterProfiles.length; j++) {
361
+//         const scored = scoreProfile(posterProfiles[j], seekerProfiles)
362
+//         scores.push(...scored)
363
+//     }
364
+//     process.stdout.write('.')
365
+//     console.log('\n\nCOMPLETED: Scoring Profiles...')
366
+//     return scores.reverse()
367
+// }
368
+
369
+// const match_queues = scoreAll().map((score, i) => {
370
+//     score.match_queue_id = i + 1
371
+//     // Comment out the next line to see the scores
372
+//     delete score.score
373
+//     return score
374
+// })
375
+
376
+console.log('---\nFINISHED...\n===\n')

+ 12
- 4
backend/db/seeds/01-users.js Bestand weergeven

1
 const mock = require('../mock')
1
 const mock = require('../mock')
2
-const generated = require('../_generated')
2
+const fs = require('fs')
3
 
3
 
4
-exports.seed = function (knex) {
4
+let users = []
5
+const generatedDataPath = './db/generated'
6
+let fileNames = fs.readdirSync(generatedDataPath)
7
+for (let name of fileNames) {
8
+    const data = require(`../generated/${name}`)
9
+    users = [...users, ...data.users]
10
+}
11
+
12
+exports.seed = knex => {
5
     // Deletes ALL existing entries
13
     // Deletes ALL existing entries
6
     return knex('users')
14
     return knex('users')
7
         .truncate()
15
         .truncate()
8
         .then(function () {
16
         .then(function () {
9
             // Inserts seed entries
17
             // Inserts seed entries
10
-            return generated.users
11
-                ? knex('users').insert(generated.users)
18
+            return users
19
+                ? knex('users').insert(users)
12
                 : knex('users').insert(mock.users)
20
                 : knex('users').insert(mock.users)
13
         })
21
         })
14
 }
22
 }

+ 12
- 4
backend/db/seeds/02-profiles.js Bestand weergeven

1
 const mock = require('../mock')
1
 const mock = require('../mock')
2
-const generated = require('../_generated')
2
+const fs = require('fs')
3
 
3
 
4
-exports.seed = function (knex) {
4
+let profiles = []
5
+const generatedDataPath = './db/generated'
6
+let fileNames = fs.readdirSync(generatedDataPath)
7
+for (let name of fileNames) {
8
+    const data = require(`../generated/${name}`)
9
+    profiles = [...profiles, ...data.profiles]
10
+}
11
+
12
+exports.seed = knex => {
5
     // Deletes ALL existing entries
13
     // Deletes ALL existing entries
6
     return knex('profiles')
14
     return knex('profiles')
7
         .truncate()
15
         .truncate()
8
         .then(function () {
16
         .then(function () {
9
             // Inserts seed entries
17
             // Inserts seed entries
10
-            return generated.profiles
11
-                ? knex('profiles').insert(generated.profiles)
18
+            return profiles
19
+                ? knex('profiles').insert(profiles)
12
                 : knex('profiles').insert(mock.profiles)
20
                 : knex('profiles').insert(mock.profiles)
13
         })
21
         })
14
 }
22
 }

+ 12
- 4
backend/db/seeds/04-responses.js Bestand weergeven

1
 const mock = require('../mock')
1
 const mock = require('../mock')
2
-const generated = require('../_generated')
2
+const fs = require('fs')
3
 
3
 
4
-exports.seed = function (knex) {
4
+let responses = []
5
+const generatedDataPath = './db/generated'
6
+let fileNames = fs.readdirSync(generatedDataPath)
7
+for (let name of fileNames) {
8
+    const data = require(`../generated/${name}`)
9
+    responses = [...responses, ...data.responses]
10
+}
11
+
12
+exports.seed = knex => {
5
     // Deletes ALL existing entries
13
     // Deletes ALL existing entries
6
     return knex('responses')
14
     return knex('responses')
7
         .truncate()
15
         .truncate()
8
         .then(function () {
16
         .then(function () {
9
             // Inserts seed entries
17
             // Inserts seed entries
10
-            return generated.responses
11
-                ? knex('responses').insert(generated.responses)
18
+            return responses
19
+                ? knex('responses').insert(responses)
12
                 : knex('responses').insert(mock.responses)
20
                 : knex('responses').insert(mock.responses)
13
         })
21
         })
14
 }
22
 }

+ 1
- 5
backend/db/seeds/09-match_queues.js Bestand weergeven

1
 const mock = require('../mock')
1
 const mock = require('../mock')
2
-const generated = require('../_generated')
3
-
4
 exports.seed = function (knex) {
2
 exports.seed = function (knex) {
5
     // Deletes ALL existing entries
3
     // Deletes ALL existing entries
6
     return knex('match_queues')
4
     return knex('match_queues')
7
         .truncate()
5
         .truncate()
8
         .then(function () {
6
         .then(function () {
9
             // Inserts seed entries
7
             // Inserts seed entries
10
-            return generated.match_queues
11
-                ? knex('match_queues').insert(generated.match_queues)
12
-                : knex('match_queues').insert(mock.match_queues)
8
+            return knex('match_queues').insert(mock.match_queues)
13
         })
9
         })
14
 }
10
 }

Laden…
Annuleren
Opslaan