{"id":"https://openalex.org/W4402671394","doi":"https://doi.org/10.18653/v1/2024.acl-srw.10","title":"Document Alignment based on Overlapping Fixed-Length Segments","display_name":"Document Alignment based on Overlapping Fixed-Length Segments","publication_year":2024,"publication_date":"2024-01-01","ids":{"openalex":"https://openalex.org/W4402671394","doi":"https://doi.org/10.18653/v1/2024.acl-srw.10"},"language":"en","primary_location":{"id":"doi:10.18653/v1/2024.acl-srw.10","is_oa":true,"landing_page_url":"http://dx.doi.org/10.18653/v1/2024.acl-srw.10","pdf_url":"https://aclanthology.org/2024.acl-srw.10.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)","raw_type":"proceedings-article"},"type":"article","indexed_in":["crossref"],"open_access":{"is_oa":true,"oa_status":"gold","oa_url":"https://aclanthology.org/2024.acl-srw.10.pdf","any_repository_has_fulltext":null},"authorships":[{"author_position":"first","author":{"id":"https://openalex.org/A5035424799","display_name":"Xiaotian Wang","orcid":"https://orcid.org/0000-0002-2394-9150"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Xiaotian Wang","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"middle","author":{"id":"https://openalex.org/A5066456246","display_name":"Takehito Utsuro","orcid":"https://orcid.org/0000-0003-4072-1833"},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Takehito Utsuro","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]},{"author_position":"last","author":{"id":"https://openalex.org/A5100520423","display_name":"Masaaki Nagata","orcid":null},"institutions":[],"countries":[],"is_corresponding":false,"raw_author_name":"Masaaki Nagata","raw_affiliation_strings":[],"raw_orcid":null,"affiliations":[]}],"institutions":[],"countries_distinct_count":0,"institutions_distinct_count":3,"corresponding_author_ids":[],"corresponding_institution_ids":[],"apc_list":null,"apc_paid":null,"fwci":0.0,"has_fulltext":true,"cited_by_count":0,"citation_normalized_percentile":{"value":0.14208808,"is_in_top_1_percent":false,"is_in_top_10_percent":false},"cited_by_percentile_year":null,"biblio":{"volume":null,"issue":null,"first_page":"103","last_page":"113"},"is_retracted":false,"is_paratext":false,"is_xpac":false,"primary_topic":{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9660999774932861,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},"topics":[{"id":"https://openalex.org/T10601","display_name":"Handwritten Text Recognition Techniques","score":0.9660999774932861,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T10824","display_name":"Image Retrieval and Classification Techniques","score":0.9610999822616577,"subfield":{"id":"https://openalex.org/subfields/1707","display_name":"Computer Vision and Pattern Recognition"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}},{"id":"https://openalex.org/T11550","display_name":"Text and Document Classification Technologies","score":0.9287999868392944,"subfield":{"id":"https://openalex.org/subfields/1702","display_name":"Artificial Intelligence"},"field":{"id":"https://openalex.org/fields/17","display_name":"Computer Science"},"domain":{"id":"https://openalex.org/domains/3","display_name":"Physical Sciences"}}],"keywords":[{"id":"https://openalex.org/keywords/computer-science","display_name":"Computer science","score":0.6100982427597046},{"id":"https://openalex.org/keywords/artificial-intelligence","display_name":"Artificial intelligence","score":0.3381001949310303}],"concepts":[{"id":"https://openalex.org/C41008148","wikidata":"https://www.wikidata.org/wiki/Q21198","display_name":"Computer science","level":0,"score":0.6100982427597046},{"id":"https://openalex.org/C154945302","wikidata":"https://www.wikidata.org/wiki/Q11660","display_name":"Artificial intelligence","level":1,"score":0.3381001949310303}],"mesh":[],"locations_count":1,"locations":[{"id":"doi:10.18653/v1/2024.acl-srw.10","is_oa":true,"landing_page_url":"http://dx.doi.org/10.18653/v1/2024.acl-srw.10","pdf_url":"https://aclanthology.org/2024.acl-srw.10.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)","raw_type":"proceedings-article"}],"best_oa_location":{"id":"doi:10.18653/v1/2024.acl-srw.10","is_oa":true,"landing_page_url":"http://dx.doi.org/10.18653/v1/2024.acl-srw.10","pdf_url":"https://aclanthology.org/2024.acl-srw.10.pdf","source":null,"license":"cc-by","license_id":"https://openalex.org/licenses/cc-by","version":"publishedVersion","is_accepted":true,"is_published":true,"raw_source_name":"Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 4: Student Research Workshop)","raw_type":"proceedings-article"},"sustainable_development_goals":[],"awards":[],"funders":[],"has_content":{"grobid_xml":true,"pdf":true},"content_urls":{"pdf":"https://content.openalex.org/works/W4402671394.pdf","grobid_xml":"https://content.openalex.org/works/W4402671394.grobid-xml"},"referenced_works_count":0,"referenced_works":[],"related_works":["https://openalex.org/W4391375266","https://openalex.org/W2748952813","https://openalex.org/W2390279801","https://openalex.org/W2358668433","https://openalex.org/W4396701345","https://openalex.org/W2376932109","https://openalex.org/W2001405890","https://openalex.org/W4396696052","https://openalex.org/W2382290278","https://openalex.org/W4395014643"],"abstract_inverted_index":{"Acquiring":[0],"large-scale":[1],"parallel":[2],"corpora":[3],"is":[4],"crucial":[5],"for":[6,22,105,143,156],"NLP":[7],"tasks":[8],"such":[9],"as":[10,145,147],"Neural":[11],"Machine":[12],"Translation,":[13],"and":[14,51,57,95,114,123,126,133,184],"web":[15,44,63],"crawling":[16,64],"has":[17],"become":[18],"a":[19,68,73,97,154],"popular":[20],"methodology":[21],"this":[23,108],"purpose.Previous":[24],"studies":[25],"have":[26],"been":[27],"conducted":[28],"based":[29,162],"on":[30,136,148],"sentencebased":[31],"segmentation":[32],"(SBS)":[33],"when":[34,100],"aligning":[35],"documents":[36],"in":[37,62,91,167],"various":[38,160],"languages":[39],"which":[40],"are":[41],"obtained":[42],"through":[43,67],"crawling.Among":[45],"them,":[46],"the":[47,59,84,102,112,137,157],"TK-PERT":[48,121],"method":[49],"(Thompson":[50,122],"Koehn,":[52,124],"2020)":[53],"achieved":[54],"state-of-the-art":[55],"results":[56],"addressed":[58],"boilerplate":[60],"text":[61],"data":[65],"well":[66,146],"down-weighting":[69],"approach.However,":[70],"there":[71],"remains":[72],"problem":[74],"with":[75,175],"how":[76],"to":[77,171],"handle":[78],"long-text":[79],"encoding":[80],"better.Thus,":[81],"we":[82,110],"introduce":[83],"strategy":[85],"of":[86,93],"Overlapping":[87],"Fixed-Length":[88],"Segmentation":[89],"(OFLS)":[90],"place":[92],"SBS,":[94],"observe":[96],"pronounced":[98],"enhancement":[99],"performing":[101],"same":[103],"approach":[104],"document":[106,139,187],"alignment.In":[107],"paper,":[109],"compare":[111],"SBS":[113,161],"OFLS":[115,179],"using":[116],"three":[117],"previous":[118],"methods,":[119],"Mean-Pool,":[120],"2020),":[125,135],"Optimal":[127],"Transport":[128],"(Clark":[129],"et":[130],"al.,":[131],"2019;El-Kishky":[132],"Guzmn,":[134],"WMT16":[138,158],"alignment":[140],"shared":[141],"task":[142],"French-English,":[144],"our":[149],"self-established":[150],"Japanese-English":[151],"dataset":[152],"MnRN.As":[153],"result,":[155],"task,":[159],"methods":[163],"showed":[164],"an":[165],"increase":[166],"recall":[168],"by":[169],"1%":[170],"10%":[172],"after":[173],"reproduction":[174],"OFLS.For":[176],"MnRN":[177],"data,":[178],"demonstrated":[180],"notable":[181],"accuracy":[182],"improvements":[183],"exhibited":[185],"faster":[186],"embedding":[188],"speed.":[189]},"counts_by_year":[],"updated_date":"2026-06-11T09:08:48.828518","created_date":"2025-10-10T00:00:00"}
