PowerShell scripts for creating and reading test files with the standard Unicode character encoding schemes and default encodings.
文章推薦指數: 80 %
We can't be sure that the enclosing file will have a UTF-8 BOM, e.g., when downloaded from a Gist (GitHub). PowerShell itself defaults to "ANSI" encoding when reading a file without a BOM ...
Skiptocontent
Allgists
BacktoGitHub
Signin
Signup
Sign in
Sign up
{{message}}
Instantly share code, notes, and snippets.
mklement0/New-EncodingTestFiles.ps1
LastactiveAug16,2022
Star
3
Fork
2
Star
Code
Revisions
14
Stars
3
Forks
2
Embed
Whatwouldyouliketodo?
Embed
Embedthisgistinyourwebsite.
Share
Copysharablelinkforthisgist.
Clonevia
HTTPS
Clone with Git or checkout with SVN using the repository’s web address.
LearnmoreaboutcloneURLs
DownloadZIP
PowerShell scripts for creating and reading test files with the standard Unicode character encoding schemes and default encodings.
Raw
New-EncodingTestFiles.ps1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Show hidden characters
param(
  # Text to write into every generated test file.
  # IMPORTANT: AVOID NON-ASCII STRING LITERALS, BECAUSE
  #            WE CAN'T BE SURE THAT THE ENCLOSING FILE WILL HAVE A UTF-8 BOM,
  #            E.G., WHEN DOWNLOADED FROM A Gist (GitHub).
  #            POWERSHELL ITSELF DEFAULTS TO "ANSI" ENCODING WHEN READING A
  #            FILE WITHOUT BOM.
  # The default is therefore assembled from code points rather than typed
  # literally: U+006F (LATIN SMALL LETTER O) followed by
  # U+00F6 (LATIN SMALL LETTER O WITH DIAERESIS).
  [string] $Text = [char[]] (0x6f, 0xf6) -join '',

  # Directory that receives the generated files.
  [Alias('Path')]
  [string] $LiteralPath = './enc-test'
)

# Turn every error into a terminating one so that a failed write aborts the run.
$ErrorActionPreference = 'Stop'
# Always show the Write-Verbose progress messages emitted by this script.
$VerbosePreference = 'Continue'
# Instantiate encoders and store them in an ordered hashtable whose keys reflect
# the encoding scheme; the keys are used as the file names.
# NOTE: UTF-7 is not included, because the encoder doesn't offer creating a BOM
#       and PowerShell doesn't expect one when reading with -Encoding UTF7
#       (it returns U+FEFF as a literal char.)
#  "utf7"  = New-Object System.Text.UTF7Encoding $False
#  "utf7o" = New-Object System.Text.UTF7Encoding $True # allow optional direct chars.
# Key legend:
#  B ... *with* BOM
#  N ... *no* BOM
#  le, be ... little-endian, big-endian
# Entries whose value is $null are not written via a .NET encoder; the key alone
# tells the file-writing loop which cmdlet to use instead.
$htEncs = [ordered] @{
  "utf8B"      = New-Object System.Text.UTF8Encoding $True             # BOM-or-not
  "utf8N"      = New-Object System.Text.UTF8Encoding $False            # BOM-or-not
  "utf16leB"   = New-Object System.Text.UnicodeEncoding $False, $True  # big-or-little-endian, BOM-or-not
  "utf16leN"   = New-Object System.Text.UnicodeEncoding $False, $False # big-or-little-endian, BOM-or-not
  "utf16beB"   = New-Object System.Text.UnicodeEncoding $True, $True   # big-or-little-endian, BOM-or-not
  "utf16beN"   = New-Object System.Text.UnicodeEncoding $True, $False  # big-or-little-endian, BOM-or-not
  "utf32leB"   = New-Object System.Text.UTF32Encoding $False, $True    # big-or-little-endian, BOM-or-not
  "utf32leN"   = New-Object System.Text.UTF32Encoding $False, $False   # big-or-little-endian, BOM-or-not
  "utf32beB"   = New-Object System.Text.UTF32Encoding $True, $True     # big-or-little-endian, BOM-or-not
  "utf32beN"   = New-Object System.Text.UTF32Encoding $True, $False    # big-or-little-endian, BOM-or-not
  "default"    = [System.Text.Encoding]::Default
  "sc-default" = $null # Use Set-Content - note that [System.Text.Encoding]::Default is not the same in PS Core.
  "of-default" = $null # Use Out-File (which uses UTF-16LE)
}
# Determine file contents for the various file types, by filename extension.
# Every format embeds $Text so that the reading script can recover it: as the
# raw content (.txt) or as a value named "Value" (.csv, .clixml, .psd1).
$htTexts = [ordered] @{
  '.txt'    = $Text
  # Note: Import-Csv requires field-internal " chars. to be escaped as "", in line with RFC 4180
  '.csv'    = @"
Value
"$($Text -replace '"', '""')"
"@
  # CLIXML serialization of a custom object with a single string property "Value".
  # The text is XML-escaped so that &, <, > and quotes cannot break the document.
  # NOTE(review): the XML payload was missing from this copy of the script (an
  # empty here-string, which Import-Clixml cannot parse); it was reconstructed to
  # provide the .Value property - confirm against the original gist.
  '.clixml' = @"
<Objs Version="1.1.0.1" xmlns="http://schemas.microsoft.com/powershell/2004/04">
  <Obj RefId="0">
    <TN RefId="0">
      <T>System.Management.Automation.PSCustomObject</T>
      <T>System.Object</T>
    </TN>
    <MS>
      <S N="Value">$([System.Security.SecurityElement]::Escape($Text))</S>
    </MS>
  </Obj>
</Objs>
"@
  # PowerShell data file: embedded ' chars. must be doubled inside the
  # single-quoted string literal.
  '.psd1'   = @"
@{
  Value = '$($Text -replace "'", "''")'
}
"@
}
# Determine output path and create output dir. on demand.
# -LiteralPath is used for the existence check so that wildcard metacharacters
# in the path (e.g. [ and ]) are taken verbatim, as the parameter name promises.
if (-not (Test-Path -LiteralPath $LiteralPath)) { # output dir. doesn't exist, create it
  Write-Host -ForegroundColor Yellow "OK to create output dir.?"
  # New-Item outputs nothing if the user declines the confirmation prompt;
  # in that case there is nowhere to write to, so give up.
  if (-not (New-Item -ItemType Directory -Path $LiteralPath -Confirm:$true)) { exit 1 }
}

# Make sure that the .NET framework uses the same working dir. as PS, so that
# a relative output path handed to .NET file APIs resolves to the same location.
[io.directory]::SetCurrentDirectory($PWD.ProviderPath)
# Create the files: one per combination of file type (extension) and encoding
# key, named <encoding key><extension> inside $LiteralPath.
foreach ($ext in $htTexts.Keys) {
  $txt = $htTexts.$ext # the content depends on the extension only
  foreach ($name in $htEncs.Keys) {
    $enc = $htEncs.$name
    $fpath = Join-Path $LiteralPath ($name + $ext)
    Write-Verbose "Writing to: $fpath"
    if ($name -eq 'sc-default') { # Use Set-Content with its default encoding
      Set-Content -Value $txt -NoNewline -LiteralPath $fpath
    } elseif ($name -eq 'of-default') { # Use Out-File with its default encoding
      Out-File -InputObject $txt -NoNewline -LiteralPath $fpath
    } else {
      # Write via .NET with the explicit encoder, which controls BOM and byte order.
      [io.file]::WriteAllText($fpath, $txt, $enc)
    }
  }
}
Raw
Read-EncodingTestFiles.ps1
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Show hidden characters
[CmdletBinding()]
param(
  # Text that every test file is expected to decode to.
  # IMPORTANT: AVOID NON-ASCII STRING LITERALS, BECAUSE
  #            WE CAN'T BE SURE THAT THE ENCLOSING FILE WILL HAVE A UTF-8 BOM,
  #            E.G., WHEN DOWNLOADED FROM A Gist (GitHub).
  #            POWERSHELL ITSELF DEFAULTS TO "ANSI" ENCODING WHEN READING A
  #            FILE WITHOUT BOM.
  # The default is therefore assembled from code points rather than typed
  # literally: U+006F (LATIN SMALL LETTER O) followed by
  # U+00F6 (LATIN SMALL LETTER O WITH DIAERESIS).
  [string] $ReferenceText = [char[]] (0x6f, 0xf6) -join '', # Matches New-EncodingTestFiles's default

  # The path of the directory containing the test files.
  [string] $LiteralPath = './enc-test' # Matches New-EncodingTestFiles's default
)

# ENSURE THAT THIS FILE IS UTF-8-ENCODED *WITH* A BOM - otherwise PowerShell
# will not interpret it correctly.
function Get-CodePointList($Text) {
  <#
  .SYNOPSIS
  Returns the characters of a string as a space-separated list of hex. numbers.

  .DESCRIPTION
  Each UTF-16 code unit of the input is formatted as '0x' followed by at least
  two lowercase hex. digits. Characters outside the BMP therefore show up as
  two entries (a surrogate pair).
  Returns '(null)' for $null input and '(empty)' for the empty string.

  .PARAMETER Text
  The text to examine. Non-string input is converted to a string first.

  .EXAMPLE
  > Get-CodePointList 'oö'
  0x6f 0xf6
  #>
  # The parameter is deliberately untyped: a [string] constraint would turn
  # $null into '' and make the two cases indistinguishable.
  if ($null -eq $Text) { return '(null)' }
  [string] $str = $Text
  # Test the length rather than comparing with '': string comparison is
  # culture-sensitive and may consider ignorable characters such as U+FEFF
  # equal to the empty string.
  if ($str.Length -eq 0) { return '(empty)' }
  # Casting the resulting array to [string] joins the elements with spaces.
  [string] ([int[]] $str.ToCharArray() | ForEach-Object { '0x{0:x2}' -f $_ })
}
# Turn every error into a terminating one.
# Note: This is also necessary to make the try / catch handlers work.
$ErrorActionPreference = 'Stop'

# The filename extensions and what cmdlet(s) to load them with.
# Each extension maps to one or more definitions with these entries:
#   cmdletName  ... the cmdlet to read the file with
#   fixedParams ... (optional) parameters to always pass to that cmdlet
# Note that Import-PowerShellDataFile and Import-Clixml do not support the
# -Encoding parameter
$htExts = [ordered] @{
  '.txt'    = @{ cmdletName = 'Get-Content' },
              @{ cmdletName = 'Select-String'; fixedParams = @{ Pattern = $ReferenceText; SimpleMatch = $true } }
  '.csv'    = @{ cmdletName = 'Import-Csv' }
  '.psd1'   = @{ cmdletName = 'Import-PowerShellDataFile' }
  '.clixml' = @{ cmdletName = 'Import-Clixml' }
}

# Map the filename roots to the corresponding -Encoding parameter values.
# (The root is the file's base name without the trailing B / N BOM marker.)
$htEncodingNames = @{
  utf8         = 'utf8'
  utf16le      = 'Unicode'
  utf16be      = 'BigEndianUnicode'
  utf32le      = 'UTF32'
  utf32be      = 'BigEndianUTF32'
  'default'    = 'Default'
  'sc-default' = 'Default'
  'of-default' = 'Unicode'
}
# Loop over all file types and read every matching test file with each cmdlet
# defined for that type. One result object is emitted per cmdlet / pass / file,
# with properties Cmdlet, Method, FileName and Result.
foreach ($ext in $htExts.Keys) {

  $cmdDefs = $htExts.$ext
  # Look for the files in the directory given via -LiteralPath (taken verbatim,
  # without wildcard interpretation).
  $files = Get-ChildItem -LiteralPath $LiteralPath -File | Where-Object { $_.Extension -eq $ext }

  Write-Verbose "============= $ext"

  # Read with and without -Encoding parameter.
  foreach ($cmdDef in $cmdDefs) {

    $cmd = $cmdDef.cmdletName
    # Work on a copy of the fixed parameters, so that adding -Encoding below
    # does not modify the shared cmdlet definition.
    $htParams = @{}
    if ($cmdDef.fixedParams) { $htParams = $cmdDef.fixedParams.Clone() }

    # Pass 1 reads without -Encoding, pass 2 with the matching -Encoding value.
    for ($pass = 1; $pass -le 2; ++$pass) {

      # 2nd pass: See if the cmdlet even supports -Encoding and skip, if
      # not.
      if ($pass -eq 2) {
        if (-not (Get-Command $cmd).Parameters.ContainsKey('Encoding')) {
          Write-Verbose "== ($cmd doesn't have an -Encoding parameter)"
          break
        }
      }

      Write-Verbose "== Using $cmd $(if ($pass -eq 1) { 'WITHOUT' } else { 'WITH appropriate' }) -Encoding parameter:"

      foreach ($file in $files) {

        $encName = '(default)'
        if ($pass -eq 2) {
          # The base name minus the trailing B / N (BOM / no BOM) marker is the
          # key into the encoding-name map.
          $encName = $htEncodingNames.$($file.BaseName -replace '[NB]$')
          $htParams.Encoding = $encName # Set -Encoding argument
        }

        $exceptionText = ''
        try {
          $content = & $cmd $file.FullName @htParams
        }
        catch {
          $exceptionText = "$_"
          # A failure to bind -Encoding means that the cmdlet doesn't accept
          # this particular encoding name.
          if ($_.Exception.ParameterName -eq 'Encoding') {
            $result = "NOT SUPPORTED: $encName"
          } else {
            $result = "ERROR"
          }
        }

        if ($exceptionText) { # reading failed
          Write-Verbose "${encName}, ${cmd}: exception occurred: $exceptionText"
        } else { # reading succeeded, but encoding may not be correct
          # Extract the string value to test from the *object* that some of the
          # cmdlets return.
          if ($content -isnot [string]) {
            if ($cmd -eq 'Select-String') { # The matched line is in the .Line property
              $content = $content.Line
            } else { # all others have a .Value property
              $content = $content.Value
            }
          }
          Write-Verbose "${encName}, ${cmd}: value: [$content]; code points: $(Get-CodePointList $content)"
          if ($null -eq $content) { # value could not be read
            $result = "NOTHING READ"
          } else { # make sure that what was read matches the reference text code point by code point
            # An ordinal comparison is required here: the culture-sensitive -eq
            # operator ignores case and may ignore characters such as a stray
            # U+FEFF, which would hide decoding errors.
            if ($content -is [string] -and [string]::Equals($content, $ReferenceText, [StringComparison]::Ordinal)) {
              $result = 'ok'
            } else {
              $result = 'INCORRECTLY DECODED'
            }
          }
        }

        [pscustomobject] @{
          Cmdlet   = $cmd
          Method   = if ($pass -eq 1) { 'Auto' } else { '-Encoding' }
          FileName = $file.Name
          Result   = $result
        } # | Out-Default

      }

    } # foreach $pass

  } # foreach $cmd

} # foreach $ext
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
You can’t perform that action at this time.
You signed in with another tab or window. Reload to refresh your session.
You signed out in another tab or window. Reload to refresh your session.
延伸文章資訊
- 1. Byte order mark - Globalization - Microsoft Learn
- 2. Changing source files encoding and some fun with PowerShell
Changing source files encoding and some fun with PowerShell ... At least it can correctly read te...
- 3. UTF-8 - MDN Web Docs Glossary: Definitions of Web-related terms
- 4. Change / Save encoding How to convert several txt files UTF ...
Hello. I need to convert several txt files , located in C:\Folder1 from UTF-8 to UTF-8-BOM. I had...
- 5. Read UTF-8 files correctly with PowerShell - Stack Overflow
I need a function that can read any file with UTF-8 encoding, ignore and delete the BOM and not m...