{"id":"https://openalex.org/W4376122831","doi":"https://doi.org/10.48550/arxiv.2305.05271","title":"Robust Acoustic and Semantic Contextual Biasing in Neural Transducers for Speech Recognition","display_name":"Robust Acoustic and Semantic Contextual Biasing in Neural Transducers for Speech Recognition","publication_year":2023,"publication_date":"2023-05-09","ids":{"openalex":"https://openalex.org/W4376122831","doi":"https://doi.org/10.48550/arxiv.2305.05271"},"language":"en","primary_location":{"id":"pmh:oai:arXiv.org:2305.05271","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.05271","pdf_url":"https://arxiv.org/pdf/2305.05271","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"type":"preprint","indexed_in":["arxiv","datacite"],"open_access":{"is_oa":true,"oa_status":"green","oa_url":"https://arxiv.org/pdf/2305.05271","any_repository_has_fulltext":true},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5055157490","display_name":"Xuandi Fu","orcid":"https://orcid.org/0000-0002-9922-6856"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Fu, Xuandi","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5084035667","display_name":"Kanthashree Mysore Sathyendra","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Sathyendra, Kanthashree Mysore","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5014054572","display_name":"Ankur Gandhe","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Gandhe, Ankur","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5100375136","display_name":"Jing Liu","orcid":"https://orcid.org/0000-0003-4690-1886"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Liu, Jing","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5023708721","display_name":"Grant P. Strimel","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Strimel, Grant P.","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5007036768","display_name":"Ross McGowan","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"McGowan, Ross","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5029573067","display_name":"Athanasios Mouchtaris","orcid":"https://orcid.org/0000-0001-7583-0189"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Mouchtaris, Athanasios","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":0,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":null,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":null,"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":null,"last_page":null},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10201","display_name":"Speech Recognition and Synthesis","score":0.9998000264167786,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10860","display_name":"Speech and Audio Processing","score":0.9926999807357788,"subfield":{"id":"https://openalex.org/subfields/1711","display_name":"Signal Processing"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10028","display_name":"Topic Modeling","score":0.9883000254631042,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/utterance","display_name":"Utterance","score":0.7553668022155762},{"id":"https://openalex.org/keywords/biasing","display_name":"Biasing","score":0.6970857977867126},{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6921648979187012},{"id":"https://openalex.org/keywords/context","display_name":"Context (archaeology)","score":0.5989459753036499},{"id":"https://openalex.org/keywords/speech-recognition","display_name":"Speech recognition","score":0.5752165913581848},{"id":"https://openalex.org/keywords/pronunciation","display_name":"Pronunciation","score":0.5195537209510803},{"id":"https://openalex.org/keywords/encoder","display_name":"Encoder","score":0.4741743803024292},{"id":"https://openalex.org/keywords/semantic-similarity","display_name":"Semantic similarity","score":0.41057950258255005},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.34845566749572754},{"id":"https://openalex.org/keywords/natural-language-processing","display_name":"Natural language processing","score":0.32216498255729675},{"id":"https://openalex.org/keywords/linguistics","display_name":"Linguistics","score":0.17451685667037964},{"id":"https://openalex.org/keywords/voltage","display_name":"Voltage","score":0.08803823590278625},{"id":"https://openalex.org/keywords/engineering","display_name":"Engineering","score":0.06472310423851013}],"concepts":[{"id":"https://openalex.org/C2775852435","wikidata":"https://www.wikidata.org/wiki/Q258403","display_name":"Utterance","level":2,"score":0.7553668022155762},{"id":"https://openalex.org/C20254490","wikidata":"https://www.wikidata.org/wiki/Q719550","display_name":"Biasing","level":3,"score":0.6970857977867126},{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6921648979187012},{"id":"https://openalex.org/C2779343474","wikidata":"https://www.wikidata.org/wiki/Q3109175","display_name":"Context (archaeology)","level":2,"score":0.5989459753036499},{"id":"https://openalex.org/C28490314","wikidata":"https://www.wikidata.org/wiki/Q189436","display_name":"Speech recognition","level":1,"score":0.5752165913581848},{"id":"https://openalex.org/C2780844864","wikidata":"https://www.wikidata.org/wiki/Q184377","display_name":"Pronunciation","level":2,"score":0.5195537209510803},{"id":"https://openalex.org/C118505674","wikidata":"https://www.wikidata.org/wiki/Q42586063","display_name":"Encoder","level":2,"score":0.4741743803024292},{"id":"https://openalex.org/C130318100","wikidata":"https://www.wikidata.org/wiki/Q2268914","display_name":"Semantic similarity","level":2,"score":0.41057950258255005},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.34845566749572754},{"id":"https://openalex.org/C204321447","wikidata":"https://www.wikidata.org/wiki/Q30642","display_name":"Natural language processing","level":1,"score":0.32216498255729675},{"id":"https://openalex.org/C41895202","wikidata":"https://www.wikidata.org/wiki/Q8162","display_name":"Linguistics","level":1,"score":0.17451685667037964},{"id":"https://openalex.org/C165801399","wikidata":"https://www.wikidata.org/wiki/Q25428","display_name":"Voltage","level":2,"score":0.08803823590278625},{"id":"https://openalex.org/C127413603","wikidata":"https://www.wikidata.org/wiki/Q11023","display_name":"Engineering","level":0,"score":0.06472310423851013},{"id":"https://openalex.org/C111919701","wikidata":"https://www.wikidata.org/wiki/Q9135","display_name":"Operating system","level":1,"score":0.0},{"id":"https://openalex.org/C151730666","wikidata":"https://www.wikidata.org/wiki/Q7205","display_name":"Paleontology","level":1,"score":0.0},{"id":"https://openalex.org/C86803240","wikidata":"https://www.wikidata.org/wiki/Q420","display_name":"Biology","level":0,"score":0.0},{"id":"https://openalex.org/C138885662","wikidata":"https://www.wikidata.org/wiki/Q5891","display_name":"Philosophy","level":0,"score":0.0},{"id":"https://openalex.org/C119599485","wikidata":"https://www.wikidata.org/wiki/Q43035","display_name":"Electrical engineering","level":1,"score":0.0}],"mesh":[],"locations_count":2,"locations":[{"id":"pmh:oai:arXiv.org:2305.05271","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.05271","pdf_url":"https://arxiv.org/pdf/2305.05271","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},{"id":"doi:10.48550/arxiv.2305.05271","is_oa":true,"landing_page_url":"https://doi.org/10.48550/arxiv.2305.05271","pdf_url":null,"source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":null,"is_accepted":false,"is_published":null,"raw_source_name":null,"raw_type":"Preprint"}],"best_oa_location":{"id":"pmh:oai:arXiv.org:2305.05271","is_oa":true,"landing_page_url":"http://arxiv.org/abs/2305.05271","pdf_url":"https://arxiv.org/pdf/2305.05271","source":{"id":"https://openalex.org/S4306400194","display_name":"arXiv (Cornell University)","issn_l":null,"issn":null,"is_oa":true,"is_in_doaj":false,"is_core":false,"host_organization":"https://openalex.org/I205783295","host_organization_name":"Cornell University","host_organization_lineage":["https://openalex.org/I205783295"],"host_organization_lineage_names":[],"type":"repository"},"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"submittedVersion","is_accepted":false,"is_published":false,"raw_source_name":"","raw_type":"text"},"sustainable_development_goals":[{"display_name":"Quality Education","id":"https://metadata.un.org/sdg/4","score":0.6000000238418579}],"awards":[],"funders":[],"has_content":{"pdf":true,"grobid_xml":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4376122831.pdf","grobid_xml":"https://content.openalex.org/works/W4376122831.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W2183593636","https://openalex.org/W2529301793","https://openalex.org/W2350724007","https://openalex.org/W2384121599","https://openalex.org/W2355751417","https://openalex.org/W2038083449","https://openalex.org/W2423284978","https://openalex.org/W2083922162","https://openalex.org/W2000075989","https://openalex.org/W4220683390"],"abstract_inverted_index":{"Attention-based":[0],"contextual":[1,37,95,106,129,169],"biasing":[2,73,96,133,163,178],"approaches":[3,28,46],"have":[4],"shown":[5],"significant":[6],"improvements":[7,200,211],"in":[8,16],"the":[9,33,43,54,102,105,123,136,150,167,199],"recognition":[10],"of":[11],"generic":[12],"and/or":[13],"personal":[14],"rare-words":[15],"End-to-End":[17],"Automatic":[18],"Speech":[19],"Recognition":[20],"(E2E":[21],"ASR)":[22],"systems":[23],"like":[24],"neural":[25,115],"transducers.":[26],"These":[27],"employ":[29],"cross-attention":[30],"to":[31,42,64,83,88,93,121,131,192],"bias":[32,55],"model":[34,117,148,170],"towards":[35],"specific":[36],"entities":[38,107,130],"injected":[39],"as":[40],"bias-phrases":[41],"model.":[44,195],"Prior":[45],"typically":[47],"relied":[48],"on":[49,75,149,161,212],"subword":[50,58],"encoders":[51,120],"for":[52,72],"encoding":[53],"phrases.":[56],"However,":[57],"tokenizations":[59],"are":[60,201],"coarse":[61],"and":[62,104,176,207,216],"fail":[63],"capture":[65],"granular":[66],"pronunciation":[67,91],"information":[68],"which":[69],"is":[70],"crucial":[71],"based":[74,119],"acoustic":[76,99,109,175],"similarity.":[77],"In":[78],"this":[79],"work,":[80],"we":[81,185],"propose":[82],"use":[84],"lightweight":[85],"character":[86],"representations":[87],"encode":[89,122],"fine-grained":[90],"features":[92],"improve":[94],"guided":[97],"by":[98,135],"similarity":[100],"between":[101],"audio":[103],"(termed":[108,140],"biasing).":[110,142],"We":[111],"further":[112],"integrate":[113],"pretrained":[114],"language":[116],"(NLM)":[118],"utterance's":[124,137],"semantic":[125,138,141,177],"context":[126,139],"along":[127],"with":[128,205],"perform":[132],"informed":[134],"Experiments":[143],"using":[144],"a":[145,154,181],"Conformer":[146],"Transducer":[147],"Librispeech":[151,213],"dataset":[152],"show":[153],"4.62%":[155],"-":[156],"9.26%":[157],"relative":[158,188,209],"WER":[159,189,210],"improvement":[160,190],"different":[162],"list":[164],"sizes":[165],"over":[166],"baseline":[168,194],"when":[171],"incorporating":[172],"our":[173,193],"proposed":[174],"approach.":[179],"On":[180,196],"large-scale":[182],"in-house":[183,218],"dataset,":[184],"observe":[186],"7.91%":[187],"compared":[191],"tail":[197],"utterances,":[198],"even":[202],"more":[203],"pronounced":[204],"36.80%":[206],"23.40%":[208],"rare":[214],"words":[215],"an":[217],"testset":[219],"respectively.":[220]},"counts_by_year":[],"updated_date":"2026-07-01T06:00:48.157686","created_date":"2025-10-10T00:00:00"}
