Skip to content

Commit

Permalink
wip
Browse files Browse the repository at this point in the history
  • Loading branch information
andrefs committed Oct 18, 2023
1 parent b8030c1 commit c235a2d
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 33 deletions.
Binary file modified docs/entities.dia
Binary file not shown.
Binary file removed docs/entities.dia.autosave
Binary file not shown.
66 changes: 33 additions & 33 deletions src/manager/lib/Manager.ts
Original file line number Diff line number Diff line change
Expand Up @@ -360,39 +360,39 @@ export default class Manager {
}
}

/**
 * Async generator that hands out crawl work, one domain at a time.
 *
 * For every domain produced by `Domain.domainsToCrawl(workerId, limit)` it:
 *  1. selects up to `resourcesPerDomain` resources of that domain that are
 *     still `'unvisited'`, have `headCount > 0` and a `minPathLength` below
 *     `config.graph.maxPathLength`, ordered by descending `headCount`;
 *  2. marks those resources as `'crawling'` and tags them with the domain's
 *     `jobId`;
 *  3. stores the number of selected resources in the domain's
 *     `crawl.ongoing` field;
 *  4. yields the domain together with the selected resources.
 *
 * @param workerId - Forwarded unchanged to `Domain.domainsToCrawl`.
 * @param limit - Forwarded unchanged to `Domain.domainsToCrawl`.
 * @param resourcesPerDomain - Maximum number of resources selected per
 *   domain. Any falsy value (including `0`) falls back to `10`.
 * @yields `{ domain, resources }`, where `resources` are lean objects
 *   selected with the `url` field only.
 */
async *domainsToCrawl(
  workerId: string,
  limit: number,
  resourcesPerDomain: number
) {
  // Only consulted by the (currently disabled) warning after the loop.
  let noDomainsFound = true;
  for await (const domain of Domain.domainsToCrawl(workerId, limit)) {
    noDomainsFound = false;
    const filter = {
      domain: domain.origin,
      status: 'unvisited',
      minPathLength: { $lt: config.graph.maxPathLength },
      headCount: { $gt: 0 },
    };
    const heads = await Resource.find(filter)
      .sort('-headCount')
      .select('url')
      // `||` (not `??`): a limit of 0 is also replaced by the default of 10.
      .limit(resourcesPerDomain || 10)
      .lean();
    // No `.lean()` here: lean only affects queries that return documents,
    // and an update query resolves to a write result instead.
    await Resource.updateMany(
      { url: { $in: heads.map((h) => h.url) } },
      { status: 'crawling', jobId: domain.jobId }
    );
    // Record how many resources of this domain were just handed out.
    await Domain.updateOne(
      { origin: domain.origin, jobId: domain.jobId },
      { 'crawl.ongoing': heads.length }
    );
    yield { domain, resources: heads };
  }
  if (noDomainsFound) {
    //log.warn('No domains left to crawl!');
  }
}
//async *domainsToCrawl(
// workerId: string,
// limit: number,
// resourcesPerDomain: number
//) {
// let noDomainsFound = true;
// for await (const domain of Domain.domainsToCrawl(workerId, limit)) {
// noDomainsFound = false;
// const filter = {
// domain: domain.origin,
// status: 'unvisited',
// minPathLength: { $lt: config.graph.maxPathLength },
// headCount: { $gt: 0 },
// };
// const heads = await Resource.find(filter)
// .sort('-headCount')
// .select('url')
// .limit(resourcesPerDomain || 10)
// .lean();
// await Resource.updateMany(
// { url: { $in: heads.map((h) => h.url) } },
// { status: 'crawling', jobId: domain.jobId }
// ).lean();
// await Domain.updateOne(
// { origin: domain.origin, jobId: domain.jobId },
// { 'crawl.ongoing': heads.length }
// );
// yield { domain, resources: heads };
// }
// if (noDomainsFound) {
// //log.warn('No domains left to crawl!');
// }
//}

async *assignJobs(
workerId: string,
Expand Down
2 changes: 2 additions & 0 deletions src/models/Domain.ts
Original file line number Diff line number Diff line change
Expand Up @@ -462,6 +462,7 @@ schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
let procSkip = 0;
let pathLimit = 20; // TODO get from config

// iterate over processes
PROCESS_LOOP: while (domainsFound < domLimit) {
const proc = await Process.getOneRunning(procSkip);
if (!proc) {
Expand All @@ -470,6 +471,7 @@ schema.statics.domainsToCrawl2 = async function* (wId, domLimit, resLimit) {
procSkip++;

let pathSkip = 0;
// iterate over process' paths
PATHS_LOOP: while (domainsFound < domLimit) {
const paths = await proc.getPaths(pathSkip, pathLimit);

Expand Down

0 comments on commit c235a2d

Please sign in to comment.