Using HttpRequest in Excel to Convert PDF to Word Document

This article mainly discusses how to build a multipart/form-data POST request (using a WebKitFormBoundary-style boundary) to upload files from VBA, since converting PDF documents through a web service is quite convenient. The endpoints for document conversion and status checking (I am not certain the status-checking code is correct) were found by reading the site's JavaScript in the browser's developer tools and setting breakpoints to inspect the exact parameters. Calling the service through HttpRequest is somewhat unstable: sometimes every response looks correct, yet the file cannot be downloaded. Files with Traditional Chinese names convert without issues on the website, but when using HttpRequest the returned file address is garbled, and it is unclear where the problem lies.

' Win32 import: urlmon!URLDownloadToFileA downloads szURL to the local file szFileName.
' Returns 0 (S_OK) on success, an HRESULT error code otherwise.
' pCaller and lpfnCB are pointer-sized arguments (callers in this file pass 0 for both),
' so they must be LongPtr on VBA7 (required for 64-bit Office) and Long on older hosts;
' dwReserved is a 32-bit DWORD. VBA's Integer is only 16 bits wide and is the wrong type here.
#If VBA7 Then
Public Declare PtrSafe Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" ( _
    ByVal pCaller As LongPtr, _
    ByVal szURL As String, _
    ByVal szFileName As String, _
    ByVal dwReserved As Long, _
    ByVal lpfnCB As LongPtr) As Long
#Else
Public Declare Function URLDownloadToFile Lib "urlmon" Alias "URLDownloadToFileA" ( _
    ByVal pCaller As Long, _
    ByVal szURL As String, _
    ByVal szFileName As String, _
    ByVal dwReserved As Long, _
    ByVal lpfnCB As Long) As Long
#End If
' Converts the PDF whose full path is in cell B1 of the active sheet to a .docx
' using the CleverPDF web endpoints, then writes a result message to cell B3.
'
' Steps:
'   1. Upload the PDF as multipart/form-data to /pdf/uploadFiles.
'   2. Ask /pdf/doProcess.do to convert the uploaded file.
'   3. Poll /pdf/fetchStatus (up to 3 times, 5 s apart) until the response contains "result":0.
'   4. Download the .docx next to the source PDF.
'   5. Ask the server to remove the generated and the uploaded file.
'
' Depends on names defined elsewhere: JsonConverter.ParseJson, Random16, ToBytes,
' waitsec, downloadFile, encodeURI, and a project reference to MSXML2.
Sub PDF2Docx()
    Const REFERER As String = "https://www.cleverpdf.com/cn/pdf-to-word"
    Const USER_AGENT As String = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/135.0.0.0 Safari/537.36"

    Dim json As Object
    Dim http As MSXML2.ServerXMLHTTP
    Dim ado As Object
    Dim bData() As Byte
    Dim image As Variant
    Dim BOUNDARY As String
    Dim part As String
    Dim strPath As String
    Dim FILENAME As String
    Dim strDocxName As String
    Dim strIndex As String
    Dim strIndex2 As String
    Dim strFileURL As String
    Dim strStatus As String
    Dim strDownURL As String
    Dim i As Integer
    Dim bCon As Boolean
    Dim errNum As Long
    Dim errSrc As String
    Dim errDesc As String

    ' --- Validate input -------------------------------------------------------
    strPath = Cells(1, 2)
    If Len(strPath) = 0 Then
        MsgBox "File cannot be empty", vbInformation
        Exit Sub
    End If
    ' Compare the extension case-insensitively so "X.PDF" is accepted too.
    If LCase$(Right$(strPath, 4)) <> ".pdf" Then
        MsgBox "Must be a PDF file", vbInformation
        Exit Sub
    End If

    On Error GoTo Fail

    FILENAME = Mid$(strPath, InStrRev(strPath, "\") + 1)
    ' Swap the 4-character extension instead of Replace(".pdf"), which is case-sensitive
    ' and would also rewrite ".pdf" occurring in the middle of the name.
    ' Assumes the server names its output the same way - TODO confirm.
    strDocxName = Left$(FILENAME, Len(FILENAME) - 4) & ".docx"

    Set http = New MSXML2.ServerXMLHTTP

    ' --- 1. Upload ------------------------------------------------------------
    BOUNDARY = "WebKitFormBoundary" & Random16()
    ' An empty "files" field followed by the actual file part.
    part = BuildFormField(BOUNDARY, "files", "")
    part = part & "------" & BOUNDARY & vbCrLf
    ' VBA escapes a double quote inside a string literal by doubling it.
    part = part & "Content-Disposition: form-data; name=""files""; filename=""" & FILENAME & """" & vbCrLf
    part = part & "Content-Type: application/pdf" & vbCrLf & vbCrLf

    ' Read the PDF as raw bytes.
    Set ado = CreateObject("ADODB.Stream")
    ado.Type = 1 ' binary
    ado.Open
    ado.LoadFromFile strPath
    ado.Position = 0
    image = ado.read
    ado.Close

    ' Assemble the request body: part headers + file bytes + closing boundary.
    ado.Open
    ado.Position = 0
    ado.Type = 1 ' binary
    ado.Write ToBytes(part)
    ado.Write image
    ado.Write ToBytes(vbCrLf & "------" & BOUNDARY & "--")
    ado.Position = 0

    Application.StatusBar = "Uploading file..."
    http.Open "POST", "https://www.cleverpdf.com/pdf/uploadFiles", False
    http.setRequestHeader "Content-Type", "multipart/form-data; boundary=----" & BOUNDARY
    http.setRequestHeader "Referer", REFERER
    http.setRequestHeader "User-Agent", USER_AGENT
    http.send ado.read
    ado.Close
    http.waitForResponse

    If http.Status = 200 Then
        Set json = JsonConverter.ParseJson(http.responseText)
        strIndex = json("index")
        strStatus = json("STATUS")
        strFileURL = "uploadFiles/file/" & strIndex & "/" & FILENAME

        ' --- 2. Start conversion ----------------------------------------------
        BOUNDARY = "WebKitFormBoundary" & Random16()
        part = BuildFormField(BOUNDARY, "url", strFileURL)
        part = part & BuildFormField(BOUNDARY, "index", strIndex)
        part = part & BuildFormField(BOUNDARY, "pid", "1")
        part = part & BuildFormField(BOUNDARY, "oid", "1")
        part = part & BuildFormField(BOUNDARY, "status", strStatus)
        part = part & BuildFormField(BOUNDARY, "pwd", "")
        part = part & BuildFormField(BOUNDARY, "formatv1", "1")
        part = part & "------" & BOUNDARY & "--" & vbCrLf

        ' Convert the body to single-byte (ANSI) data for sending.
        bData = StrConv(part, vbFromUnicode)

        Application.StatusBar = "Converting file..."
        http.Open "POST", "https://www.cleverpdf.com/pdf/doProcess.do", False
        http.setRequestHeader "Content-Type", "multipart/form-data; boundary=----" & BOUNDARY
        ' UBound is the last index, not the count: the length is UBound - LBound + 1.
        http.setRequestHeader "Content-Length", CStr(UBound(bData) - LBound(bData) + 1)
        http.setRequestHeader "Referer", REFERER
        http.setRequestHeader "User-Agent", USER_AGENT
        http.send bData
        http.waitForResponse

        waitsec 5
        strIndex2 = ""
        If http.Status = 200 Then
            Set json = JsonConverter.ParseJson(http.responseText)
            strIndex2 = json("index")

            ' --- 3. Poll for completion ---------------------------------------
            bCon = False
            For i = 1 To 3
                http.Open "GET", "https://www.cleverpdf.com/pdf/fetchStatus?index=" & strIndex2, False
                http.send ""
                http.waitForResponse
                If InStr(1, http.responseText, """result"":0") > 0 Then
                    bCon = True
                    Exit For
                End If
                waitsec 5
            Next i

            ' --- 4. Download --------------------------------------------------
            If bCon Then
                strDownURL = "https://www.cleverpdf.com/" & strIndex2 & "/" & strDocxName
                Debug.Print strDownURL
                Cells(3, 2) = downloadFile(strDownURL, Left$(strPath, Len(strPath) - 4) & ".docx")
            End If
        End If

        ' --- 5. Clean up on the server ----------------------------------------
        ' Delete generated file
        If Len(strIndex2) > 0 Then
            strDownURL = "https://www.cleverpdf.com/pdf/removeFiles?path=" & strIndex2 & "%5C" & encodeURI(strDocxName)
            http.Open "GET", strDownURL, False
            http.setRequestHeader "Referer", REFERER
            http.send ""
            http.waitForResponse
            Debug.Print http.responseText
        End If

        ' Delete original file
        http.Open "GET", "https://www.cleverpdf.com/pdf/removeFiles?path=" & Replace(encodeURI(strFileURL), "/", "%2F"), False
        http.send
        http.waitForResponse
        Debug.Print http.responseText
    End If

    Application.StatusBar = False
    Exit Sub

Fail:
    ' Restore the status bar before surfacing the error, so a failed run
    ' does not leave "Uploading file..." / "Converting file..." stuck on screen.
    errNum = Err.Number
    errSrc = Err.Source
    errDesc = Err.Description
    Application.StatusBar = False
    Err.Raise errNum, errSrc, errDesc
End Sub

' Returns one multipart/form-data text field: the boundary delimiter line,
' the Content-Disposition header, a blank line, the value and a trailing CRLF.
Private Function BuildFormField(ByVal strBoundary As String, ByVal strName As String, ByVal strValue As String) As String
    BuildFormField = "------" & strBoundary & vbCrLf & _
                     "Content-Disposition: form-data; name=""" & strName & """" & vbCrLf & vbCrLf & _
                     strValue & vbCrLf
End Function
' Returns a string of 16 characters drawn at random from [a-zA-Z0-9].
' Used as the variable suffix of a multipart boundary.
Function Random16() As String
    Const characters As String = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
    Static seeded As Boolean
    Dim result As String
    Dim i As Integer
    Dim randomIndex As Integer

    ' Without Randomize, Rnd replays the same sequence in every fresh session.
    ' Seed once only: re-seeding on each call within the same timer tick
    ' would reproduce the same string.
    If Not seeded Then
        Randomize
        seeded = True
    End If

    result = ""
    For i = 1 To 16
        ' Rnd is in [0, 1), so randomIndex is in 1..Len(characters).
        randomIndex = Int((Len(characters) * Rnd) + 1)
        result = result & Mid$(characters, randomIndex, 1)
    Next i
    Random16 = result
End Function
' Downloads strDownURL to the local path strLocalURL via URLDownloadToFile
' and returns a human-readable outcome message.
'   strDownURL  - URL to fetch; it is passed through as-is (no URI encoding is applied here).
'   strLocalURL - destination file path.
' Returns the success text when the API call returns 0, the error text otherwise.
Function downloadFile(strDownURL As String, strLocalURL As String) As String
    Const API_SUCCESS As Long = 0

    If URLDownloadToFile(0, strDownURL, strLocalURL, 0, 0) = API_SUCCESS Then
        downloadFile = "Document converted and downloaded"
        Exit Function
    End If

    downloadFile = "Error: Conversion failed or document download failed"
End Function
' Busy-waits for roughly dS seconds while pumping the message loop (DoEvents),
' so Excel stays responsive during the pause.
'   dS - number of seconds to wait; zero or negative returns after one DoEvents.
Private Sub waitsec(ByVal dS As Double)
    Const SECONDS_PER_DAY As Double = 86400#
    Dim startTime As Double
    Dim elapsed As Double

    ' Timer returns seconds since midnight as a number, so keep it numeric.
    startTime = Timer
    Do
        DoEvents
        elapsed = Timer - startTime
        ' Timer wraps to 0 at midnight; without this correction the difference
        ' goes negative and the loop would spin for about a day.
        If elapsed < 0 Then elapsed = elapsed + SECONDS_PER_DAY
    Loop While elapsed < dS
End Sub
' Returns strText decoded with the script engine's JavaScript decodeURI().
' The text is embedded in a single-quoted JS string literal and evaluated, so every
' character that is special inside such a literal is escaped first.
' NOTE(review): ScriptControl is, as far as I know, only available to 32-bit hosts - confirm.
Function decodeURI(ByVal strText As String) As String
    Dim JS As Object
    Dim literal As String

    ' Backslash must be escaped first, otherwise the escapes added below get doubled.
    literal = Replace(strText, "\", "\\")
    literal = Replace(literal, "'", "\'")
    ' A raw line break would terminate the JS string literal.
    literal = Replace(literal, vbCr, "\r")
    literal = Replace(literal, vbLf, "\n")

    Set JS = CreateObject("ScriptControl")
    JS.Language = "JavaScript"
    decodeURI = JS.Eval("decodeURI('" & literal & "');")
End Function
' Returns strText percent-encoded with the script engine's JavaScript encodeURI().
' The text is embedded in a single-quoted JS string literal and evaluated, so every
' character that is special inside such a literal is escaped first.
' NOTE(review): ScriptControl is, as far as I know, only available to 32-bit hosts - confirm.
Function encodeURI(ByVal strText As String) As String
    Dim JS As Object
    Dim literal As String

    ' Backslash must be escaped first, otherwise the escapes added below get doubled.
    ' Unescaped, "a\b" would reach JavaScript as "a" + backspace.
    literal = Replace(strText, "\", "\\")
    literal = Replace(literal, "'", "\'")
    ' A raw line break would terminate the JS string literal.
    literal = Replace(literal, vbCr, "\r")
    literal = Replace(literal, vbLf, "\n")

    Set JS = CreateObject("ScriptControl")
    JS.Language = "JavaScript"
    encodeURI = JS.Eval("encodeURI('" & literal & "');")
End Function
' Returns the UTF-8 encoded bytes of str as a Variant holding a Byte array
' (Null when str is empty, because the stream has nothing left to read).
' The text is written through an ADODB.Stream in text mode and read back in binary mode.
Function ToBytes(str As String) As Variant
    Dim ado As Object
    Dim head As Variant

    Set ado = CreateObject("ADODB.Stream")
    ado.Type = 2 ' text
    ' Use an explicit encoding: "_autodetect" is a detection pseudo-charset rather than
    ' a defined output encoding, so non-ASCII text (e.g. Chinese file names) would be
    ' encoded unpredictably.
    ado.Charset = "utf-8"
    ado.Open
    ado.WriteText str

    ' Switch to binary mode to read the encoded bytes back.
    ado.Position = 0
    ado.Type = 1 ' binary

    ' The stream prefixes UTF-8 text with a byte-order mark (EF BB BF). It must not be
    ' sent as part of a request body, so skip it when present.
    If ado.Size >= 3 Then
        head = ado.read(3)
        If Not (head(0) = &HEF And head(1) = &HBB And head(2) = &HBF) Then
            ado.Position = 0
        End If
    End If

    ToBytes = ado.read
    ado.Close
End Function

Source file address: https://www.alipan.com/s/omR9gxBaQtj This example uses the CleverPDF web interface; there are many other free tools for PDF conversion, such as the Python pdf2docx library. I also recommend StirlingPDF, an excellent open-source PDF tool that has both PC and web versions. Its Word conversion relies on LibreOffice, however, and I found some characters missing in the output when I used it.

Leave a Comment