ExcelBanter

ExcelBanter (https://www.excelbanter.com/)
-   Excel Programming (https://www.excelbanter.com/excel-programming/)
-   -   Getting <Title out of HTML source code (https://www.excelbanter.com/excel-programming/409914-getting-title-out-html-source-code.html)

Mark[_66_]

Getting <Title out of HTML source code
 
Hello

I was hoping to get some tips on how to get what is in between
<Title....</Title tags in a text file containing HTML source code
into column 3. The VBA code searches through many folders and
subfolders finding any file that is .html and reading it. I tried
parsing but can't seem to get it to work. Any ideas? Here is the code
that I have (that doesn't work)

Sub CheckTextFilesForHREFs()
MsgBox "Press OK to begin report"
Dim WholeLine As String
Dim myPath As String
Dim workfile As String
Dim myR As Long

myPath = "C:\Exelon\"
workfile = Dir(myPath & "*.html")
'sLine = WholeLine

Set fs = Application.FileSearch
With fs
..LookIn = "C:\Exelon"
..Filename = ".html"
..SearchSubFolders = True
'.FileType = mosFileTypeAllFiles
If .Execute(SortBy:=msoSortByFileName, _
SortOrder:=msoSortOrderAscending) 0 Then
MsgBox "There were " & .FoundFiles.Count & _
" file(s) found."
For i = 1 To .FoundFiles.Count
ParseURL .FoundFiles(i)
ParseTitle .FoundFiles(i)
ParseLink .FoundFiles(i)
Next i

Else
MsgBox "There were no files found."
End If
End With

Sub ParseTitle(strFile As String)
Dim strTxt As String, lngTxt As Long, i As Long, oMatches
Dim ws As Worksheet, j As Long, k As Long, m As Long, oMatches2
Dim reg, oMatches3, reg2
i = FreeFile
'strFile = "c:\Users\Richard\Documents\Htmltest.html"
lngTxt = FileLen(strFile)
strTxt = Space(lngTxt)
Open strFile For Binary Access Read As #i
Get #i, , strTxt
Close #i
Debug.Print strTxt
With CreateObject("vbscript.regexp")
..Global = True
..ignorecase = True
..Pattern = vbCrLf & ".*?title.*?(?=" & vbCrLf & ")"
If .test(strTxt) Then
Set oMatches = .Execute(strTxt)
For i = 0 To oMatches.Count - 1
Set reg = CreateObject("vbscript.regexp")
With reg
..Global = True
..ignorecase = True
..Pattern = "<title\""(.*?)\"""
k = Cells(Rows.Count, 1).End(xlUp).Offset(1).Row
Cells(k, 1).Value = strFile
If .test(oMatches(i)) Then
Set oMatches2 = .Execute(oMatches(i))
For j = 0 To oMatches2.Count - 1
Cells(k, j + 3) = .Replace(oMatches2(j), "$1")
Next j
End If
End With
Next i
End If
End With
End Sub

Rick Rothstein \(MVP - VB\)[_1803_]

Getting <Title out of HTML source code
 
I'm not all that familiar with HTML source code... can there be more than
one <title, </title pair in a file? If no, then this function will return
that single title...

Function GetTitle(TextToSearch As String) As String
GetTitle = Split(Split(TextToSearch, "<title", , vbTextCompare)(1), _
"</title", , vbTextCompare)(0)
End Function

If, on the other hand, there can be more than one, then this function will
return a zero-based array containing all the found titles...

Function GetTitle(TextToSearch As String) As String()
Dim X As Long
Dim Titles() As String
Titles = Split(TextToSearch, "<title", , vbTextCompare)
For X = 1 To UBound(Titles)
Titles(X - 1) = Split(Titles(X), "</title", , vbTextCompare)(0)
Next
ReDim Preserve Titles(UBound(Titles) - 1)
GetTitle = Titles
End Function

Rick



"Mark" wrote in message
...
Hello

I was hoping to get some tips on how to get what is in between
<Title....</Title tags in a text file containing HTML source code
into column 3. The VBA code searches through many folders and
subfolders finding any file that is .html and reading it. I tried
parsing but can't seem to get it to work. Any ideas? Here is the code
that I have (that doesn't work)

Sub CheckTextFilesForHREFs()
MsgBox "Press OK to begin report"
Dim WholeLine As String
Dim myPath As String
Dim workfile As String
Dim myR As Long

myPath = "C:\Exelon\"
workfile = Dir(myPath & "*.html")
'sLine = WholeLine

Set fs = Application.FileSearch
With fs
.LookIn = "C:\Exelon"
.Filename = ".html"
.SearchSubFolders = True
'.FileType = mosFileTypeAllFiles
If .Execute(SortBy:=msoSortByFileName, _
SortOrder:=msoSortOrderAscending) 0 Then
MsgBox "There were " & .FoundFiles.Count & _
" file(s) found."
For i = 1 To .FoundFiles.Count
ParseURL .FoundFiles(i)
ParseTitle .FoundFiles(i)
ParseLink .FoundFiles(i)
Next i

Else
MsgBox "There were no files found."
End If
End With

Sub ParseTitle(strFile As String)
Dim strTxt As String, lngTxt As Long, i As Long, oMatches
Dim ws As Worksheet, j As Long, k As Long, m As Long, oMatches2
Dim reg, oMatches3, reg2
i = FreeFile
'strFile = "c:\Users\Richard\Documents\Htmltest.html"
lngTxt = FileLen(strFile)
strTxt = Space(lngTxt)
Open strFile For Binary Access Read As #i
Get #i, , strTxt
Close #i
Debug.Print strTxt
With CreateObject("vbscript.regexp")
.Global = True
.ignorecase = True
.Pattern = vbCrLf & ".*?title.*?(?=" & vbCrLf & ")"
If .test(strTxt) Then
Set oMatches = .Execute(strTxt)
For i = 0 To oMatches.Count - 1
Set reg = CreateObject("vbscript.regexp")
With reg
.Global = True
.ignorecase = True
.Pattern = "<title\""(.*?)\"""
k = Cells(Rows.Count, 1).End(xlUp).Offset(1).Row
Cells(k, 1).Value = strFile
If .test(oMatches(i)) Then
Set oMatches2 = .Execute(oMatches(i))
For j = 0 To oMatches2.Count - 1
Cells(k, j + 3) = .Replace(oMatches2(j), "$1")
Next j
End If
End With
Next i
End If
End With
End Sub



Mark[_66_]

Getting <Title out of HTML source code
 
On Apr 24, 10:02*am, "Rick Rothstein \(MVP - VB\)"
wrote:
I'm not all that familiar with HTML source code... can there be more than
one <title, </title pair in a file? If no, then this function will return
that single title...

Function GetTitle(TextToSearch As String) As String
* GetTitle = Split(Split(TextToSearch, "<title", , vbTextCompare)(1), _
* * * * * * * * * * * * * * * * * * * "</title", , vbTextCompare)(0)
End Function

If, on the other hand, there can be more than one, then this function will
return a zero-based array containing all the found titles...

Function GetTitle(TextToSearch As String) As String()
* Dim X As Long
* Dim Titles() As String
* Titles = Split(TextToSearch, "<title", , vbTextCompare)
* For X = 1 To UBound(Titles)
* * Titles(X - 1) = Split(Titles(X), "</title", , vbTextCompare)(0)
* Next
* ReDim Preserve Titles(UBound(Titles) - 1)
* GetTitle = Titles
End Function

Rick

"Mark" wrote in message

...



Hello


I was hoping to get some tips on how to get what is in between
<Title....</Title tags in a text file containing HTML source code
into column 3. The VBA code searches through many folders and
subfolders finding any file that is .html and reading it. I tried
parsing but can't seem to get it to work. Any ideas? Here is the code
that I have (that doesn't work)


Sub CheckTextFilesForHREFs()
MsgBox "Press OK to begin report"
Dim WholeLine As String
Dim myPath As String
Dim workfile As String
Dim myR As Long


myPath = "C:\Exelon\"
workfile = Dir(myPath & "*.html")
'sLine = WholeLine


Set fs = Application.FileSearch
With fs
.LookIn = "C:\Exelon"
.Filename = ".html"
.SearchSubFolders = True
'.FileType = mosFileTypeAllFiles
If .Execute(SortBy:=msoSortByFileName, _
SortOrder:=msoSortOrderAscending) 0 Then
MsgBox "There were " & .FoundFiles.Count & _
" file(s) found."
For i = 1 To .FoundFiles.Count
ParseURL .FoundFiles(i)
ParseTitle .FoundFiles(i)
ParseLink .FoundFiles(i)
Next i


Else
MsgBox "There were no files found."
End If
End With


Sub ParseTitle(strFile As String)
Dim strTxt As String, lngTxt As Long, i As Long, oMatches
Dim ws As Worksheet, j As Long, k As Long, m As Long, oMatches2
Dim reg, oMatches3, reg2
i = FreeFile
'strFile = "c:\Users\Richard\Documents\Htmltest.html"
lngTxt = FileLen(strFile)
strTxt = Space(lngTxt)
Open strFile For Binary Access Read As #i
Get #i, , strTxt
Close #i
Debug.Print strTxt
With CreateObject("vbscript.regexp")
.Global = True
.ignorecase = True
.Pattern = vbCrLf & ".*?title.*?(?=" & vbCrLf & ")"
If .test(strTxt) Then
Set oMatches = .Execute(strTxt)
For i = 0 To oMatches.Count - 1
Set reg = CreateObject("vbscript.regexp")
With reg
.Global = True
.ignorecase = True
.Pattern = "<title\""(.*?)\"""
k = Cells(Rows.Count, 1).End(xlUp).Offset(1).Row
Cells(k, 1).Value = strFile
If .test(oMatches(i)) Then
Set oMatches2 = .Execute(oMatches(i))
For j = 0 To oMatches2.Count - 1
Cells(k, j + 3) = .Replace(oMatches2(j), "$1")
Next j
End If
End With
Next i
End If
End With
End Sub- Hide quoted text -


- Show quoted text -


Thanks. Would I just put beneathe the function then something
like .Cells(C:C) or something for it to put it into the column C?

Rick Rothstein \(MVP - VB\)[_1808_]

Getting <Title out of HTML source code
 
I'm not all that familiar with HTML source code... can there be more
than one <title, </title pair in a file? If no, then this function
will
return that single title...

Function GetTitle(TextToSearch As String) As String
GetTitle = Split(Split(TextToSearch, "<title", , vbTextCompare)(1), _
"</title", , vbTextCompare)(0)
End Function

If, on the other hand, there can be more than one, then this function
will
return a zero-based array containing all the found titles...

Function GetTitle(TextToSearch As String) As String()
Dim X As Long
Dim Titles() As String
Titles = Split(TextToSearch, "<title", , vbTextCompare)
For X = 1 To UBound(Titles)
Titles(X - 1) = Split(Titles(X), "</title", , vbTextCompare)(0)
Next
ReDim Preserve Titles(UBound(Titles) - 1)
GetTitle = Titles
End Function


Thanks. Would I just put beneathe the function then something
like .Cells(C:C) or something for it to put it into the column C?


I'm not sure I understand what you mean when you say "put beneath the
function"... it is a function (no different from something like Sin or Sqr),
you call it and use it results. Something like this for the single <title
occurrence...

.....
..... <<< Open file and get contents
..... <<< assume text is assigned to TextFromFile variable

Msgbox GetTitle(TextFromFile)

..... <<<rest of your code
.....

For the multi-occurrence of <title, you will need to assign the return from
the function to a dynamically declared array and then, as you would do with
any array, iterate that array to get each individual item in it.

Rick



All times are GMT +1. The time now is 09:22 PM.

Powered by vBulletin® Copyright ©2000 - 2024, Jelsoft Enterprises Ltd.
ExcelBanter.com