Skip to content

#769 Add EBCDIC processor as a library routine #771

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 7 commits into from
Jul 31, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -72,16 +72,55 @@ class Copybook(val ast: CopybookAST) extends Logging with Serializable {
}

/**
* Get the AST object of a field by name.
* Get value of a field of the copybook record by name
*
* Nested field names can contain '.' to identify the exact field.
* If the field name is unique '.' is not required.
*
* @param fieldName A field name
* @param recordBytes Binary encoded data of the record
* @param startOffset An offset where the record starts in the data (in bytes).
* @return The value of the field
*/
def getFieldValueByName(fieldName: String, recordBytes: Array[Byte], startOffset: Int = 0): Any = {
  // Resolve the field's AST node first; only primitive (leaf) fields carry a decodable value.
  getFieldByName(fieldName) match {
    case primitive: Primitive =>
      extractPrimitiveField(primitive, recordBytes, startOffset)
    case _ =>
      throw new IllegalStateException(s"$fieldName is not a primitive field, cannot extract its value.")
  }
}

/**
* Sets the value of a copybook record field specified by name.
*
* Nested field names can contain '.' to identify the exact field.
* If the field name is unique, '.' is not required.
*
* This method modifies the record in place and does not return a value.
*
* @param fieldName A field name
* @param recordBytes Binary encoded data of the record
* @param value The value to set
* @param startOffset An offset where the record starts in the data (in bytes)
*/
def setFieldValueByName(fieldName: String, recordBytes: Array[Byte], value: Any, startOffset: Int = 0): Unit =
  // Only primitive (leaf) fields have encoders, so anything else is rejected up front.
  getFieldByName(fieldName) match {
    case primitive: Primitive =>
      setPrimitiveField(primitive, recordBytes, value, startOffset)
    case _ =>
      throw new IllegalStateException(s"$fieldName is not a primitive field, cannot set its value.")
  }

/**
* Get the AST object of a field by name.
*
* Nested field names can contain '.' to identify the exact field.
* If the field name is unique, '.' is not required.
*
* @param fieldName A field name
* @return An AST object of the field. Throws an IllegalStateException if not found or multiple fields are found.
*
*/
@throws(classOf[IllegalArgumentException])
def getFieldByName(fieldName: String): Statement = {

def getFieldByNameInGroup(group: Group, fieldName: String): Seq[Statement] = {
Expand Down Expand Up @@ -171,31 +210,40 @@ class Copybook(val ast: CopybookAST) extends Logging with Serializable {
* @return The value of the field
*
*/
@throws(classOf[Exception])
def extractPrimitiveField(field: Primitive, bytes: Array[Byte], startOffset: Int = 0): Any = {
  // The field's payload begins at its declared offset shifted by where the record starts in the buffer.
  val fieldStart = field.binaryProperties.offset + startOffset
  val fieldEnd = fieldStart + field.binaryProperties.actualSize
  // slice() is tolerant of out-of-range bounds, matching the original behavior for short buffers.
  field.decodeTypeValue(0, bytes.slice(fieldStart, fieldEnd))
}

/**
* Get value of a field of the copybook record by name
* Set value of a field of the copybook record by the AST object of the field
*
* Nested field names can contain '.' to identify the exact field.
* If the field name is unique '.' is not required.
*
* @param fieldName A field name
* @param field The AST object of the field
* @param bytes Binary encoded data of the record
* @param startOffset An offset where the record starts in the data (in bytes).
* @param startOffset An offset to the beginning of the field in the data (in bytes).
* @return The value of the field
*
*/
@throws(classOf[IllegalStateException])
@throws(classOf[Exception])
def getFieldValueByName(fieldName: String, bytes: Array[Byte], startOffset: Int = 0): Any = {
val ast = getFieldByName(fieldName)
ast match {
case s: Primitive => extractPrimitiveField(s, bytes, startOffset)
case _ => throw new IllegalStateException(s"$fieldName is not a primitive field, cannot extract it's value.")
def setPrimitiveField(field: Primitive, recordBytes: Array[Byte], value: Any, startOffset: Int = 0): Unit = {
  field.encode match {
    case None =>
      // No encoder is available for this field's data type (e.g. an unsupported PIC clause).
      throw new IllegalStateException(s"Cannot set value for field '${field.name}' because it does not have an encoder defined.")
    case Some(encoder) =>
      // Encode first (may itself fail), then validate placement before touching the record.
      val encodedBytes = encoder(value)
      val firstByte = field.binaryProperties.offset + startOffset
      val byteAfterLast = firstByte + field.binaryProperties.actualSize

      if (firstByte < 0 || byteAfterLast > recordBytes.length) {
        throw new IllegalArgumentException(s"Cannot set value for field '${field.name}' because the field is out of bounds of the record.")
      }
      if (encodedBytes.length != field.binaryProperties.dataSize) {
        throw new IllegalArgumentException(s"Cannot set value for field '${field.name}' because the encoded value has a different size than the field size.")
      }

      // In-place update of the record buffer.
      System.arraycopy(encodedBytes, 0, recordBytes, firstByte, encodedBytes.length)
  }
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import za.co.absa.cobrix.cobol.parser.CopybookParser.CopybookAST
import za.co.absa.cobrix.cobol.parser.ast.datatype._
import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive}
import za.co.absa.cobrix.cobol.parser.common.Constants
import za.co.absa.cobrix.cobol.parser.decoders.DecoderSelector
import za.co.absa.cobrix.cobol.parser.decoders.{DecoderSelector, EncoderSelector}
import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
import za.co.absa.cobrix.cobol.parser.encoding._
Expand Down Expand Up @@ -855,8 +855,9 @@ class ParserVisitor(enc: Encoding,
Map(),
isDependee = false,
identifier.toUpperCase() == Constants.FILLER,
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, isDisplayAlwaysString, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection, strictIntegralPrecision = strictIntegralPrecision)
) (Some(parent))
DecoderSelector.getDecoder(pic.value, stringTrimmingPolicy, isDisplayAlwaysString, effectiveEbcdicCodePage, effectiveAsciiCharset, isUtf16BigEndian = isUtf16BigEndian, floatingPointFormat, strictSignOverpunch = strictSignOverpunch, improvedNullDetection = improvedNullDetection, strictIntegralPrecision = strictIntegralPrecision),
EncoderSelector.getEncoder(pic.value, effectiveEbcdicCodePage, effectiveAsciiCharset)
)(Some(parent))

parent.children.append(prim)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,19 +17,26 @@
package za.co.absa.cobrix.cobol.parser.ast

import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType, Decimal, Integral}
import za.co.absa.cobrix.cobol.parser.decoders.{BinaryUtils, DecoderSelector}
import za.co.absa.cobrix.cobol.parser.decoders.{BinaryUtils, DecoderSelector, EncoderSelector}

/** An abstraction of the statements describing fields of primitive data types in the COBOL copybook
*
* @param level A level for the statement
* @param name An identifier
* @param originalName Original name of the AST element (before the conversion to the Spark-compatible name)
* @param lineNumber An line number in the copybook
* @param redefines A name of a field which is redefined by this one
* @param occurs The number of elements in an fixed size array / minimum items in variable-sized array
* @param to The maximum number of items in a variable size array
* @param dependingOn A field which specifies size of the array in a record
* @param parent A parent node
* @param level A level for the statement
* @param name An identifier
* @param originalName Original name of the AST element (before the conversion to the Spark-compatible name)
* @param lineNumber A line number in the copybook
* @param redefines A name of a field which is redefined by this one
* @param isRedefined A flag indicating if the field is redefined
* @param occurs The number of elements in a fixed size array / minimum items in variable-sized array
* @param to The maximum number of items in a variable size array
* @param dependingOn A field which specifies size of the array in a record
* @param dependingOnHandlers A map of handlers for the dependingOn field
* @param isDependee A flag indicating if the field is a dependee
* @param isFiller A flag indicating if the field is a filler
* @param decode A decoder for the field to convert from raw data to a JVM data type
* @param encode An optional encoder for the field to convert from a JVM data type to raw data
* @param binaryProperties Binary properties of the field, such as size in bits, alignment, etc.
* @param parent A parent node
*/
case class Primitive(
level: Int,
Expand All @@ -46,6 +53,7 @@ case class Primitive(
isDependee: Boolean = false,
isFiller: Boolean = false,
decode: DecoderSelector.Decoder,
encode: Option[EncoderSelector.Encoder],
binaryProperties: BinaryProperties = BinaryProperties(0, 0, 0)
)
(val parent: Option[Group] = None)
Expand Down Expand Up @@ -100,7 +108,6 @@ case class Primitive(
* @param itOffset An offset of the field inside the binary data
* @param record A record in a binary format represented as a vector of bits
*/
@throws(classOf[Exception])
def decodeTypeValue(itOffset: Int, record: Array[Byte]): Any = {
val bytesCount = binaryProperties.dataSize
val idx = itOffset
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ import za.co.absa.cobrix.cobol.parser.CopybookParser.CopybookAST
import za.co.absa.cobrix.cobol.parser.ast.datatype.AlphaNumeric
import za.co.absa.cobrix.cobol.parser.ast.{Group, Primitive, Statement}
import za.co.absa.cobrix.cobol.parser.common.Constants
import za.co.absa.cobrix.cobol.parser.decoders.DecoderSelector
import za.co.absa.cobrix.cobol.parser.decoders.{DecoderSelector, EncoderSelector}
import za.co.absa.cobrix.cobol.parser.decoders.FloatingPointFormat.FloatingPointFormat
import za.co.absa.cobrix.cobol.parser.encoding.Encoding
import za.co.absa.cobrix.cobol.parser.encoding.codepage.CodePage
Expand Down Expand Up @@ -74,13 +74,15 @@ class NonTerminalsAdder(
val sz = g.binaryProperties.actualSize
val dataType = AlphaNumeric(s"X($sz)", sz, enc = Some(enc))
val decode = DecoderSelector.getDecoder(dataType, stringTrimmingPolicy, isDisplayAlwaysString = false, ebcdicCodePage, asciiCharset, isUtf16BigEndian, floatingPointFormat, strictSignOverpunch, improvedNullDetection)
val encode = EncoderSelector.getEncoder(dataType, ebcdicCodePage, asciiCharset)
val newName = getNonTerminalName(g.name, g.parent.get)
newChildren.append(
Primitive(
g.level, newName, "", g.lineNumber,
dataType,
redefines = Some(g.name),
decode = decode,
encode = encode,
binaryProperties = g.binaryProperties
)(g.parent)
)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.parser.decoders

import za.co.absa.cobrix.cobol.parser.ast.datatype.{AlphaNumeric, CobolType}
import za.co.absa.cobrix.cobol.parser.encoding.codepage.{CodePage, CodePageCommon}
import za.co.absa.cobrix.cobol.parser.encoding.{ASCII, EBCDIC, Encoding}

import java.nio.charset.{Charset, StandardCharsets}

object EncoderSelector {
  type Encoder = Any => Array[Byte]

  /**
    * Returns an encoder for the given COBOL data type, if encoding of that type is supported.
    *
    * Currently only uncompressed (PIC X) alphanumeric fields are supported; all other types yield None.
    *
    * @param dataType       The COBOL data type of the field
    * @param ebcdicCodePage The EBCDIC code page to use for string fields
    * @param asciiCharset   The charset to use for ASCII string fields
    * @return An encoder function, or None if the data type cannot be encoded
    */
  def getEncoder(dataType: CobolType,
                 ebcdicCodePage: CodePage = new CodePageCommon,
                 asciiCharset: Charset = StandardCharsets.US_ASCII): Option[Encoder] = {
    dataType match {
      case alphaNumeric: AlphaNumeric if alphaNumeric.compact.isEmpty =>
        getStringEncoder(alphaNumeric.enc.getOrElse(EBCDIC), ebcdicCodePage, asciiCharset, alphaNumeric.length)
      case _ =>
        None
    }
  }

  /** Gets an encoder function for a string data type. The encoder is chosen depending on whether the field encoding is EBCDIC or ASCII. */
  private def getStringEncoder(encoding: Encoding,
                               ebcdicCodePage: CodePage,
                               asciiCharset: Charset,
                               fieldLength: Int
                              ): Option[Encoder] = {
    encoding match {
      case EBCDIC =>
        // NOTE(review): 'ebcdicCodePage' is not consulted here — the common code page mapping is
        // always used. Confirm whether per-code-page encoding tables are planned.
        val encoder = (a: Any) => {
          encodeEbcdicString(a.toString, CodePageCommon.asciiToEbcdicMapping, fieldLength)
        }
        Option(encoder)
      case ASCII =>
        // ASCII encoding is not supported yet.
        None
      case _ =>
        None
    }
  }

  /**
    * Encodes an ASCII string as a fixed-length EBCDIC byte array.
    *
    * The output is truncated if the string is longer than the field, and is padded with
    * EBCDIC spaces (0x40) if it is shorter.
    *
    * @param string          An input string
    * @param conversionTable A conversion table to use to convert from ASCII to EBCDIC
    * @param length          The length of the output (in bytes)
    * @return An EBCDIC representation of the string
    */
  def encodeEbcdicString(string: String, conversionTable: Array[Byte], length: Int): Array[Byte] = {
    require(length >= 0, s"Field length cannot be negative, got $length")

    val buf = new Array[Byte](length)
    // Fix: pad the remainder with EBCDIC spaces (0x40) rather than the NUL (0x00) bytes a fresh
    // array contains, so values shorter than the field decode back as space-padded text,
    // matching COBOL PIC X semantics.
    java.util.Arrays.fill(buf, 0x40.toByte)

    var i = 0
    while (i < string.length && i < length) {
      val asciiByte = string(i).toByte
      // Normalize signed bytes (0x80-0xFF map to negative values) into a 0-255 table index.
      buf(i) = conversionTable((asciiByte + 256) % 256)
      i = i + 1
    }
    buf
  }

}
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,28 @@ object CodePageCommon {
}
ebcdic2ascii
}

/**
  * The table for converting basic ASCII symbols to the EBCDIC common code page.
  *
  * The array is indexed by the unsigned ASCII byte value (0-255). Characters that have no
  * mapping are converted to 0x00.
  *
  * Fix: ASCII LF (0x0A) now maps to EBCDIC LF (0x25) and ASCII CR (0x0D) to EBCDIC CR (0x0D);
  * the two entries were previously swapped, breaking the round trip with the EBCDIC-to-ASCII table.
  */
def asciiToEbcdicMapping: Array[Byte] = {
  Array[Byte](
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x25.toByte, 0x00.toByte, 0x00.toByte, 0x0D.toByte, 0x00.toByte, 0x00.toByte, // 0 - 15
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 16 - 31
    0x40.toByte, 0x5A.toByte, 0x7F.toByte, 0x7B.toByte, 0x5B.toByte, 0x6C.toByte, 0x50.toByte, 0x7D.toByte, 0x4D.toByte, 0x5D.toByte, 0x5C.toByte, 0x4E.toByte, 0x6B.toByte, 0x60.toByte, 0x4B.toByte, 0x61.toByte, // 32 - 47
    0xF0.toByte, 0xF1.toByte, 0xF2.toByte, 0xF3.toByte, 0xF4.toByte, 0xF5.toByte, 0xF6.toByte, 0xF7.toByte, 0xF8.toByte, 0xF9.toByte, 0x7A.toByte, 0x5E.toByte, 0x4C.toByte, 0x7E.toByte, 0x6E.toByte, 0x6F.toByte, // 48 - 63
    0x7C.toByte, 0xC1.toByte, 0xC2.toByte, 0xC3.toByte, 0xC4.toByte, 0xC5.toByte, 0xC6.toByte, 0xC7.toByte, 0xC8.toByte, 0xC9.toByte, 0xD1.toByte, 0xD2.toByte, 0xD3.toByte, 0xD4.toByte, 0xD5.toByte, 0xD6.toByte, // 64 - 79
    0xD7.toByte, 0xD8.toByte, 0xD9.toByte, 0xE2.toByte, 0xE3.toByte, 0xE4.toByte, 0xE5.toByte, 0xE6.toByte, 0xE7.toByte, 0xE8.toByte, 0xE9.toByte, 0xBA.toByte, 0xE0.toByte, 0xBB.toByte, 0xB0.toByte, 0x6D.toByte, // 80 - 95
    0x79.toByte, 0x81.toByte, 0x82.toByte, 0x83.toByte, 0x84.toByte, 0x85.toByte, 0x86.toByte, 0x87.toByte, 0x88.toByte, 0x89.toByte, 0x91.toByte, 0x92.toByte, 0x93.toByte, 0x94.toByte, 0x95.toByte, 0x96.toByte, // 96 - 111
    0x97.toByte, 0x98.toByte, 0x99.toByte, 0xA2.toByte, 0xA3.toByte, 0xA4.toByte, 0xA5.toByte, 0xA6.toByte, 0xA7.toByte, 0xA8.toByte, 0xA9.toByte, 0xC0.toByte, 0x6A.toByte, 0xD0.toByte, 0xA1.toByte, 0x00.toByte, // 112 - 127
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 128 - 143
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 144 - 159
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 160 - 175
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 176 - 191
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 192 - 207
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 208 - 223
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, // 224 - 239
    0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte, 0x00.toByte // 240 - 255
  )
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
/*
* Copyright 2018 ABSA Group Limited
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package za.co.absa.cobrix.cobol.processor

import za.co.absa.cobrix.cobol.parser.ast.Group
import za.co.absa.cobrix.cobol.reader.extractors.record.RecordHandler

/**
  * A handler for processing COBOL records and mapping it to JVM data structures.
  *
  * This implementation uses an array to group data fields of struct fields.
  */
class ArrayOfAnyHandler extends RecordHandler[scala.Array[Any]] {
  // Group values are already materialized as an array, so they are passed through as-is.
  override def create(values: Array[Any], group: Group): Array[Any] = values

  // Exposes the record as an immutable sequence view of its fields.
  override def toSeq(record: Array[Any]): Seq[Any] = record.toSeq

  // Applies the given side-effecting function to every field of the record, in order.
  override def foreach(record: Array[Any])(f: Any => Unit): Unit = {
    for (element <- record) {
      f(element)
    }
  }
}
Loading
Loading