NomClient.java

  1. package no.nav.data.team.resource;

  2. import com.fasterxml.jackson.databind.ObjectMapper;
  3. import io.prometheus.client.Counter;
  4. import io.prometheus.client.Gauge;
  5. import lombok.SneakyThrows;
  6. import lombok.extern.slf4j.Slf4j;
  7. import no.nav.data.common.exceptions.TechnicalException;
  8. import no.nav.data.common.rest.RestResponsePage;
  9. import no.nav.data.common.storage.StorageService;
  10. import no.nav.data.common.storage.domain.GenericStorage;
  11. import no.nav.data.common.utils.MetricUtils;
  12. import no.nav.data.team.resource.domain.Resource;
  13. import no.nav.data.team.resource.domain.ResourceEvent;
  14. import no.nav.data.team.resource.domain.ResourceEvent.EventType;
  15. import no.nav.data.team.resource.domain.ResourceRepository;
  16. import no.nav.data.team.resource.domain.ResourceType;
  17. import no.nav.data.team.resource.dto.NomRessurs;
  18. import no.nav.data.team.settings.SettingsService;
  19. import no.nav.data.team.settings.dto.Settings;
  20. import org.apache.commons.codec.language.DoubleMetaphone;
  21. import org.apache.lucene.analysis.Analyzer;
  22. import org.apache.lucene.analysis.LowerCaseFilter;
  23. import org.apache.lucene.analysis.TokenStream;
  24. import org.apache.lucene.analysis.Tokenizer;
  25. import org.apache.lucene.analysis.core.WhitespaceTokenizer;
  26. import org.apache.lucene.analysis.miscellaneous.ASCIIFoldingFilter;
  27. import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
  28. import org.apache.lucene.analysis.ngram.EdgeNGramTokenFilter;
  29. import org.apache.lucene.analysis.phonetic.DoubleMetaphoneFilter;
  30. import org.apache.lucene.analysis.standard.StandardTokenizer;
  31. import org.apache.lucene.document.Document;
  32. import org.apache.lucene.document.Field.Store;
  33. import org.apache.lucene.document.TextField;
  34. import org.apache.lucene.index.*;
  35. import org.apache.lucene.search.BooleanClause;
  36. import org.apache.lucene.search.BooleanQuery;
  37. import org.apache.lucene.search.BoostQuery;
  38. import org.apache.lucene.search.IndexSearcher;
  39. import org.apache.lucene.search.MultiPhraseQuery;
  40. import org.apache.lucene.search.Query;
  41. import org.apache.lucene.search.ScoreDoc;
  42. import org.apache.lucene.search.Sort;
  43. import org.apache.lucene.search.TermQuery;
  44. import org.apache.lucene.store.ByteBuffersDirectory;
  45. import org.apache.lucene.store.Directory;
  46. import org.springframework.scheduling.annotation.Scheduled;
  47. import org.springframework.stereotype.Service;

  48. import java.io.IOException;
  49. import java.util.*;
  50. import java.util.stream.Collectors;
  51. import java.util.stream.Stream;

  52. import static java.util.Comparator.comparing;
  53. import static java.util.stream.Collectors.groupingBy;
  54. import static no.nav.data.common.utils.StreamUtils.convert;
  55. import static org.apache.lucene.queryparser.classic.QueryParserBase.escape;

  56. @Slf4j
  57. @Service
  58. public class
  59. NomClient {

  60.     private static final int MAX_SEARCH_RESULTS = 100;

  61.     private static final Gauge gauge = MetricUtils.gauge()
  62.             .name("nom_resources_gauge").help("Resources from nom indexed").register();
  63.     private static final Gauge dbGauge = MetricUtils.gauge()
  64.             .name("nom_resources_db_gauge").help("Resources from nom in db").register();
  65.     private static final Counter counter = MetricUtils.counter()
  66.             .name("nom_resources_read_counter").help("Resource events processed").register();
  67.     private static final Counter discardCounter = MetricUtils.counter()
  68.             .name("nom_resources_discard_counter").help("Resource events discarded").register();

  69.     private final StorageService storage;
  70.     private final SettingsService settingsService;
  71.     private final ResourceRepository resourceRepository;

  72.     private static NomClient instance;

  73.     public static NomClient getInstance() {
  74.         return instance;
  75.     }

  /**
   * Wires the collaborators, makes sure the search index has an initial commit and
   * publishes this object through {@link #getInstance()}.
   *
   * @throws TechnicalException if the initial index commit fails
   */
  public NomClient(StorageService storage, SettingsService settingsService, ResourceRepository resourceRepository) {
      this.storage = storage;
      this.settingsService = settingsService;
      this.resourceRepository = resourceRepository;
      initializeIndex();
      instance = this;
  }

  /** Opens a writer on the index and commits right away so that the index is initialized. */
  private static void initializeIndex() {
      try (var indexWriter = ResourceState.createWriter()) {
          indexWriter.commit();
      } catch (Exception e) {
          throw new TechnicalException("io error", e);
      }
  }

  88.     public Optional<Resource> getByNavIdent(String navIdent) {
  89.         return ResourceState.get(navIdent)
  90.                 .or(() -> resourceRepository.findByIdent(navIdent).map(GenericStorage::toResource).map(Resource::stale))
  91.                 .filter(r -> shouldReturn(r.getNavIdent()));
  92.     }

  93.     public Optional<Resource> getByEmail(String email) {
  94.         return ResourceState.getByEmail(email)
  95.                 .filter(r -> shouldReturn(r.getNavIdent()));
  96.     }

  97.     public Optional<String> getNameForIdent(String navIdent) {
  98.         return Optional.ofNullable(navIdent)
  99.                 .filter(this::shouldReturn)
  100.                 .flatMap(this::getByNavIdent)
  101.                 .map(Resource::getFullName);
  102.     }

  103.     @SneakyThrows
  104.     public RestResponsePage<Resource> search(String searchString) {

  105.         try (var reader = ResourceState.createReader()) {
  106.             IndexSearcher searcher = new IndexSearcher(reader);
  107.             var q = searchStringToCustomQuery(searchString, searcher);

  108.             var top = searcher.search(q, MAX_SEARCH_RESULTS, Sort.RELEVANCE);
  109.             log.debug("query '{}' hits {} returned {}", q.toString(), top.totalHits.value(), top.scoreDocs.length);
  110.             List<Resource> list = Stream.of(top.scoreDocs)
  111.                     .map(sd -> getIdent(sd, searcher))
  112.                     .filter(this::shouldReturn)
  113.                     .map(navIdent -> getByNavIdent(navIdent).orElseThrow())

  114.                     // this is easier than adding "membership" fields to the lucene index that needs to be kept in sync
  115.                     .sorted(compareNavIdByMembershipStatus())
  116.                     .collect(Collectors.toList());


  117.             return new RestResponsePage<>(list, top.totalHits.value());
  118.         } catch (IOException e) {
  119.             log.error("Failed to read lucene index", e);
  120.             throw new TechnicalException("Failed to read lucene index", e);
  121.         }
  122.     }

  123.     // navIds associated with memberships should be valued higher
  124.     private Comparator<Resource> compareNavIdByMembershipStatus() {
  125. //        return (a, b) -> {
  126. //            var aIsMemberSomewhere = isMemberSomewhere(a.getNavIdent());
  127. //            var bIsMemberSomewhere = isMemberSomewhere(b.getNavIdent());
  128. //            if (aIsMemberSomewhere == bIsMemberSomewhere) return 0;
  129. //            return (aIsMemberSomewhere) ? -1 : 1;
  130. //        };
  131.         return (a,b) -> 0;
  132.     }

  133.     private Boolean isMemberSomewhere(String navIdent) {
  134.         // TODO, consult team and area memberships
  135.         var rand = new Random(navIdent.hashCode());
  136.         return rand.nextBoolean();
  137.     }

  138.     @SneakyThrows
  139.     private Query searchStringToCustomQuery(String searchString, IndexSearcher searcher) {

  140.         var phrasePhoneticQryBuilder = new MultiPhraseQuery.Builder().setSlop(4);
  141.         var phraseNgramQryBuilder = new MultiPhraseQuery.Builder().setSlop(4);
  142.         var phraseVerbatimQryBuilder = new MultiPhraseQuery.Builder().setSlop(4);

  143.         var booleanPhoneticQryBuilder = new BooleanQuery.Builder();
  144.         var booleanNgramQryBuilder = new BooleanQuery.Builder();
  145.         var booleanVerbatimQryBuilder = new BooleanQuery.Builder();


  146.         var esc = escape(searchString.toLowerCase().replace("-", " ")).trim();
  147.         var splitString = esc.split(" +");
  148.         var doubleMetaphoneEncoder = new DoubleMetaphone();

  149.         for (var s : splitString) {
  150.             var sMetaphone = doubleMetaphoneEncoder.doubleMetaphone(s);

  151.             phrasePhoneticQryBuilder.add(new Term(ResourceState.FIELD_NAME_PHONETIC, sMetaphone));
  152.             phraseNgramQryBuilder.add(new Term(ResourceState.FIELD_NAME_NGRAMS, s));
  153.             phraseVerbatimQryBuilder.add(new Term(ResourceState.FIELD_NAME_VERBATIM, s));

  154.             booleanPhoneticQryBuilder.add(new TermQuery(new Term(ResourceState.FIELD_NAME_PHONETIC, sMetaphone)), BooleanClause.Occur.SHOULD);
  155.             booleanNgramQryBuilder.add(new TermQuery(new Term(ResourceState.FIELD_NAME_NGRAMS, s)), BooleanClause.Occur.SHOULD);
  156.             booleanVerbatimQryBuilder.add(new TermQuery(new Term(ResourceState.FIELD_NAME_VERBATIM, s)), BooleanClause.Occur.SHOULD);
  157.         }

  158.         var phrasePhoneticQry = phrasePhoneticQryBuilder.build();
  159.         var phraseNgramQry = phraseNgramQryBuilder.build();
  160.         var phraseVerbatimQry = phraseVerbatimQryBuilder.build();

  161.         var booleanPhoneticQry = booleanPhoneticQryBuilder.build();
  162.         var booleanNgramQry = booleanNgramQryBuilder.build();
  163.         var booleanVerbatimQry = booleanVerbatimQryBuilder.build();

  164. //        var boost1 = 1.5f;
  165. //        phrasePhoneticQry.createWeight(searcher, ScoreMode.COMPLETE, boost1);
  166. //        phraseNgramQry.createWeight(searcher, ScoreMode.COMPLETE, boost1);
  167. //        phraseVerbatimQry.createWeight(searcher, ScoreMode.COMPLETE, boost1);
  168. //
  169. //        var boost2 = 1f;
  170. //        booleanPhoneticQry.createWeight(searcher, ScoreMode.COMPLETE, boost2);
  171. //        booleanNgramQry.createWeight(searcher, ScoreMode.COMPLETE, boost2);
  172. //        booleanVerbatimQry.createWeight(searcher, ScoreMode.COMPLETE, boost2);


  173.         var overallBooleanQueryBuilder = new BooleanQuery.Builder()
  174.                 .add(booleanPhoneticQry, BooleanClause.Occur.SHOULD)
  175.                 .add(booleanNgramQry, BooleanClause.Occur.SHOULD)
  176.                 .add(booleanVerbatimQry, BooleanClause.Occur.SHOULD);

  177.         var overallQueryBuilder = new BooleanQuery.Builder()
  178.                 .add(phrasePhoneticQry, BooleanClause.Occur.SHOULD)
  179.                 .add(phraseNgramQry, BooleanClause.Occur.SHOULD)
  180.                 .add(phraseVerbatimQry, BooleanClause.Occur.SHOULD)
  181.                 .add(new BoostQuery(overallBooleanQueryBuilder.build(), 0.05f), BooleanClause.Occur.SHOULD);


  182.         var overallQry = overallQueryBuilder.build();
  183. //        overallQry.createWeight(searcher,ScoreMode.TOP_DOCS,0.5f);

  184.         return overallQueryBuilder.build();
  185.     }


  186.     public List<Resource> add(List<NomRessurs> nomResources) {
  187.         if (count() == 0) { // State er tom == Startup => re-laste ResourceState fra basen
  188.             storage.getAll(Resource.class).forEach( r -> {
  189.                     if (r.getNavIdent().equals("M166609")) log.debug("Adding M166609 to repo");
  190.                     ResourceState.put(r);
  191.                 }
  192.             );
  193.         }
  194.         try {
  195.             var toSave = new ArrayList<Resource>();
  196.             try (var writer = ResourceState.createWriter()) {
  197.                 Map<String, Resource> existingState = ResourceState.findAll(convert(nomResources, NomRessurs::getNavident)).stream().collect(Collectors.toMap(r -> r.getNavIdent(), r -> r));
  198.                 for (NomRessurs nomResource : nomResources) {
  199.                     var resource = new Resource(nomResource);
  200.                     ResourceStatus status = shouldSave(existingState, resource);
  201.                     if (status.shouldSave) {
  202.                         toSave.add(resource);
  203.                         if (status.previous != null) {
  204.                             checkEvents(status.previous, resource);
  205.                         }
  206.                         ResourceState.put(resource);
  207.                     }

  208.                     var luceneIdent = resource.getNavIdent().toLowerCase();
  209.                     var identTerm = new Term(ResourceState.FIELD_IDENT, luceneIdent);
  210.                     if (resource.getResourceType() == ResourceType.OTHER) {
  211.                         // Other resource types shouldn't be searchable, they should not ordinarily be a part of teams
  212.                         writer.deleteDocuments(identTerm);
  213.                         discardCounter.inc();
  214.                         continue;
  215.                     }
  216.                     Document doc = new Document();
  217.                     String name = resource.getGivenName() + " " + resource.getFamilyName();
  218.                     doc.add(new TextField(ResourceState.FIELD_NAME_VERBATIM, name, Store.NO));
  219.                     doc.add(new TextField(ResourceState.FIELD_NAME_NGRAMS, name, Store.NO));
  220.                     doc.add(new TextField(ResourceState.FIELD_NAME_PHONETIC, name, Store.NO));

  221.                     doc.add(new TextField(ResourceState.FIELD_IDENT, luceneIdent, Store.YES));

  222.                     writer.updateDocument(identTerm, doc);
  223.                     counter.inc();
  224.                 }
  225.                 storage.saveAll(toSave);
  226.             }
  227.             gauge.set(count());
  228.             return toSave;
  229.         } catch (IOException e) {
  230.             log.error("Failed to write to index", e);
  231.             throw new TechnicalException("Lucene error", e);
  232.         }
  233.     }

  234.     private ResourceStatus shouldSave(Map<String, Resource> existing, Resource resource) {
  235.         var newest = existing.get(resource.getNavIdent());
  236.         boolean shouldSave = newest == null || newest.getOffset() < resource.getOffset();
  237.         boolean shouldSave2 = newest == null || !newest.convertToResponse().equals(resource.convertToResponse());
  238.         if(shouldSave2 != shouldSave){
  239.             var r1 = newest.convertToResponse();
  240.             var r2 = resource.convertToResponse();
  241.             var o1 = newest.getOffset();
  242.             var o2 = resource.getOffset();
  243.             var eq = newest.convertToResponse().equals(resource.convertToResponse());
  244.             log.info("""
  245.                             Diff on response is not equivalent to difference in offset for navident {}
  246.                             r1: {}
  247.                             r2: {}
  248.                             offs1: {}
  249.                             offs2: {}
  250.                             r1.resp == r2.resp: {}""",
  251.                     resource.getNavIdent(),r1,r2,o1,o2,eq);
  252.         }
  253.         return new ResourceStatus(shouldSave, newest);
  254.     }

  255.     private ResourceStatus shouldSaveOld(Map<String, List<Resource>> existing, Resource resource) {
  256.         var newest = existing.getOrDefault(resource.getNavIdent(), List.of()).stream().max(comparing(Resource::getOffset));
  257.         boolean shouldSave = newest.isEmpty() || newest.get().getOffset() < resource.getOffset();
  258.         return new ResourceStatus(shouldSave, newest.orElse(null));
  259.     }

  260.     private void checkEvents(Resource previous, Resource current) {
  261.         if (!previous.isInactive() && current.isInactive()) {
  262.             log.info("ident {} became inactive, creating ResourceEvent", current.getNavIdent());
  263.             storage.save(ResourceEvent.builder().eventType(EventType.INACTIVE).ident(current.getNavIdent()).build());
  264.         }
  265.     }

  266.     private Map<String, List<Resource>> findResources(List<String> idents) {
  267.         return resourceRepository.findByIdents(idents).stream()
  268.                 .map(GenericStorage::toResource)
  269.                 .collect(groupingBy(Resource::getNavIdent));
  270.     }

  271.     public long count() {
  272.         return ResourceState.count();
  273.     }

  274.     public long countDb() {
  275.         return resourceRepository.count();
  276.     }

  277.     public void clear() {
  278.         ResourceState.clear();
  279.     }

  280.     @Scheduled(initialDelayString = "PT1M", fixedRateString = "PT1M")
  281.     public void metrics() {
  282.         gauge.set(count());
  283.         dbGauge.set(countDb());
  284.     }

  285.     @Scheduled(initialDelayString = "PT10M", fixedRateString = "PT10M")
  286.     public void cleanup() {
  287.         resourceRepository.cleanup();
  288.     }

  289.     private String getIdent(ScoreDoc sd, IndexSearcher searcher) {
  290.         try {
  291.             return searcher.getIndexReader().storedFields().document(sd.doc).get(ResourceState.FIELD_IDENT);
  292.         } catch (Exception e) {
  293.             throw new TechnicalException("io error", e);
  294.         }
  295.     }

  296.     private boolean shouldReturn(String navIdent) {
  297.         Settings settings = settingsService.getSettingsCached();
  298.         // null only for tests
  299.         return settings == null || !settings.isFilteredIdent(navIdent);
  300.     }

  301.     record ResourceStatus(boolean shouldSave, Resource previous) {

  302.     }

  303.     private static class ResourceState {

  304.         static final String FIELD_IDENT = "ident";
  305.         static final String FIELD_NAME_VERBATIM = "name_verbatim";
  306.         static final String FIELD_NAME_NGRAMS = "name_ngrams";
  307.         static final String FIELD_NAME_PHONETIC = "name_phonetic";

  308.         private static final Map<String, Resource> allResources = new HashMap<>(1 << 15);
  309.         private static final Map<String, Resource> allResourcesByMail = new HashMap<>(1 << 15);
  310.         private static Directory index = new ByteBuffersDirectory();
  311.         private static final PerFieldAnalyzerWrapper analyzer;

  312.         static {
  313.             var analyzerPerField = new HashMap<String, Analyzer>();
  314.             analyzerPerField.put(FIELD_NAME_NGRAMS, createNGramAnalyzer());
  315.             analyzerPerField.put(FIELD_NAME_PHONETIC, createMetaphoneAnalyzer());
  316.             analyzer = new PerFieldAnalyzerWrapper(createSimpleIgnoreCaseAnalyzer(), analyzerPerField);
  317.         }

  318.         static Optional<Resource> get(String ident) {
  319.             return Optional.ofNullable(allResources.get(ident.toUpperCase()));
  320.         }

  321.         static List<Resource> findAll(List<String> idents) {
  322.             return allResources.values().stream().filter(r -> idents.contains(r.getNavIdent())).toList();
  323.         }

  324.         static Optional<Resource> getByEmail(String email) {
  325.             return Optional.ofNullable(allResourcesByMail.get(email.toLowerCase()));
  326.         }

  327.         static void put(Resource resource) {
  328.             allResources.put(resource.getNavIdent().toUpperCase(), resource);
  329.             if (resource.getEmail() != null) {
  330.                 allResourcesByMail.put(resource.getEmail().toLowerCase(), resource);
  331.             }
  332.         }

  333.         static int count() {
  334.             return allResources.size();
  335.         }

  336.         static void clear() {
  337.             index = new ByteBuffersDirectory();
  338.             allResources.clear();
  339.             allResourcesByMail.clear();
  340.         }

  341.         @SneakyThrows
  342.         static IndexReader createReader() {
  343.             return DirectoryReader.open(index);
  344.         }

  345.         @SneakyThrows
  346.         static IndexWriter createWriter() {
  347.             IndexWriterConfig writerConfig = new IndexWriterConfig(getAnalyzer());
  348.             return new IndexWriter(index, writerConfig);
  349.         }

  350.         static Analyzer getAnalyzer() {
  351.             return analyzer;
  352.         }

  353.         @SneakyThrows
  354.         private static Analyzer createNGramAnalyzer(){
  355.             return new Analyzer() {
  356.                 @Override
  357.                 protected TokenStreamComponents createComponents(String fieldName) {
  358.                     Tokenizer source = new StandardTokenizer();
  359.                     TokenStream result = new LowerCaseFilter(source);
  360.                     result = new EdgeNGramTokenFilter(result ,3,40,false);
  361.                     return new TokenStreamComponents(source, result);
  362.                 }
  363.             };
  364.         }

  365.         @SneakyThrows
  366.         private static Analyzer createMetaphoneAnalyzer(){
  367.             return new Analyzer() {
  368.                 @Override
  369.                 protected TokenStreamComponents createComponents(String fieldName) {
  370.                     Tokenizer source = new StandardTokenizer();
  371.                     TokenStream result = new LowerCaseFilter(source);
  372.                     result = new DoubleMetaphoneFilter(result ,10,false);
  373.                     return new TokenStreamComponents(source, result);
  374.                 }
  375.             };
  376.         }

  377.         @SneakyThrows
  378.         private static Analyzer createSimpleIgnoreCaseAnalyzer(){
  379.             return new Analyzer() {
  380.                 @Override
  381.                 protected TokenStreamComponents createComponents(String fieldName) {
  382.                     Tokenizer source = new WhitespaceTokenizer();
  383.                     TokenStream result = new LowerCaseFilter(source);
  384.                     result = new ASCIIFoldingFilter(result);
  385.                     return new TokenStreamComponents(source, result);
  386.                 }
  387.             };
  388.         }
  389.     }
  390. }